Merge pull request #231 from Hestia-Homes/refactor-data-processor

Refactor data processor
This commit is contained in:
KhalimCK 2023-09-15 15:40:12 +01:00 committed by GitHub
commit 402d71eb77
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
20 changed files with 592 additions and 5773 deletions

View file

@ -9,4 +9,5 @@ omit =
model_data/plotting/*
recommendations/rdsap_tables.py
model_data/simulation_system/*
model_data/cleaner_app.py
model_data/cleaner_app.py
backend/app/*

2
.idea/Model.iml generated
View file

@ -7,7 +7,7 @@
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

2
.idea/misc.xml generated
View file

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
<component name="PythonCompatibilityInspectionAdvertiser">
<option name="version" value="3" />
</component>

View file

@ -11,4 +11,4 @@ db_string = connection_string.format(
dbname=get_settings().DB_NAME,
)
db_engine = create_engine(db_string)
db_engine = create_engine(db_string, pool_size=20, max_overflow=5)

View file

@ -1,32 +0,0 @@
columntypes = {
'UPRN': 'object', 'TOTAL_FLOOR_AREA': 'float64', 'FLOOR_HEIGHT': 'float64', 'PROPERTY_TYPE': 'object',
'BUILT_FORM': 'object', 'CONSTITUENCY': 'object', 'NUMBER_HABITABLE_ROOMS': 'float64',
'NUMBER_HEATED_ROOMS': 'float64', 'FIXED_LIGHTING_OUTLETS_COUNT': 'float64', 'FLOOR_LEVEL': 'float64',
'CONSTRUCTION_AGE_BAND': 'object', 'TRANSACTION_TYPE_STARTING': 'object',
'WALLS_DESCRIPTION_STARTING': 'object',
'FLOOR_DESCRIPTION_STARTING': 'object', 'LIGHTING_DESCRIPTION_STARTING': 'object',
'ROOF_DESCRIPTION_STARTING': 'object', 'MAINHEAT_DESCRIPTION_STARTING': 'object',
'HOTWATER_DESCRIPTION_STARTING': 'object', 'MAIN_FUEL_STARTING': 'object',
'MECHANICAL_VENTILATION_STARTING': 'object',
'SECONDHEAT_DESCRIPTION_STARTING': 'object', 'ENERGY_TARIFF_STARTING': 'object',
'SOLAR_WATER_HEATING_FLAG_STARTING': 'object', 'PHOTO_SUPPLY_STARTING': 'float64',
'WINDOWS_DESCRIPTION_STARTING': 'object', 'GLAZED_TYPE_STARTING': 'object',
'MULTI_GLAZE_PROPORTION_STARTING': 'float64', 'LOW_ENERGY_LIGHTING_STARTING': 'float64',
'NUMBER_OPEN_FIREPLACES_STARTING': 'float64', 'MAINHEATCONT_DESCRIPTION_STARTING': 'object',
'EXTENSION_COUNT_STARTING': 'float64', 'LODGEMENT_DATE_STARTING': 'object',
'TRANSACTION_TYPE_ENDING': 'object',
'WALLS_DESCRIPTION_ENDING': 'object', 'FLOOR_DESCRIPTION_ENDING': 'object',
'LIGHTING_DESCRIPTION_ENDING': 'object',
'ROOF_DESCRIPTION_ENDING': 'object', 'MAINHEAT_DESCRIPTION_ENDING': 'object',
'HOTWATER_DESCRIPTION_ENDING': 'object',
'MAIN_FUEL_ENDING': 'object', 'MECHANICAL_VENTILATION_ENDING': 'object',
'SECONDHEAT_DESCRIPTION_ENDING': 'object',
'ENERGY_TARIFF_ENDING': 'object', 'SOLAR_WATER_HEATING_FLAG_ENDING': 'object',
'PHOTO_SUPPLY_ENDING': 'float64',
'WINDOWS_DESCRIPTION_ENDING': 'object', 'GLAZED_TYPE_ENDING': 'object',
'MULTI_GLAZE_PROPORTION_ENDING': 'float64',
'LOW_ENERGY_LIGHTING_ENDING': 'float64', 'NUMBER_OPEN_FIREPLACES_ENDING': 'float64',
'MAINHEATCONT_DESCRIPTION_ENDING': 'object', 'EXTENSION_COUNT_ENDING': 'float64',
'LODGEMENT_DATE_ENDING': 'object',
'id': 'object'
}

View file

@ -8,8 +8,10 @@ from backend.app.config import get_settings
from backend.Property import Property
from epc_api.client import EpcClient
from utils.logger import setup_logger
from utils.s3 import read_from_s3
from recommendations.FloorRecommendations import FloorRecommendations
from recommendations.WallRecommendations import WallRecommendations
from recommendations.config import UPGRADES_MAP
from utils.uvalue_estimates import classify_decile_newvalues
from backend.app.db.utils import row2dict
from starlette.responses import Response
@ -17,6 +19,7 @@ from sqlalchemy.orm import sessionmaker
from sqlalchemy.exc import IntegrityError, OperationalError
from datetime import datetime
import pandas as pd
import msgpack
# model apis
from backend.ml_models.sap_change_model.api import SAPChangeModelAPI
@ -31,21 +34,17 @@ from backend.app.db.functions.recommendations_functions import (
)
from backend.app.db.functions.portfolio_functions import aggregate_portfolio_recommendations
from backend.app.db.connection import db_engine
from backend.app.plan.columntypes import columntypes
from model_data.optimiser.GainOptimiser import GainOptimiser
from model_data.optimiser.CostOptimiser import CostOptimiser
from backend.app.utils import epc_to_sap_lower_bound, save_dataframe_to_s3_parquet, read_parquet_from_s3
from backend.app.utils import epc_to_sap_lower_bound, read_parquet_from_s3
from model_data.optimiser.optimiser_functions import prepare_input_measures
from model_data.simulation_system.core.DataProcessor import DataProcessor
from model_data.simulation_system.core.Settings import (
FIXED_FEATURES, COMPONENT_FEATURES, COLUMNS_TO_MERGE_ON
)
from model_data.simulation_system.core.Settings import COLUMNS_TO_MERGE_ON
# TODO: This is placeholder until data is stored in DB
from backend.app.plan.uvalue_estimates_walls import uvalue_estimates_walls
from backend.app.plan.uvalue_estimates_floors import uvalue_estimates_floors
from backend.app.plan.temp_cleaned_data import cleaned
logger = setup_logger()
@ -98,12 +97,6 @@ walls_decile_data = {
'Decile 9', 'Decile 10'], 'decile_boundaries': [6., 49., 51., 55., 64., 71., 76., 83., 96.,
120., 2279.]}
lighting_averages = [
{'lighting-description': 'good lighting efficiency', 'low-energy-lighting': 99.26666666666667},
{'lighting-description': 'excellent lighting efficiency', 'low-energy-lighting': 100.0},
{'lighting-description': 'below average lighting efficiency', 'low-energy-lighting': 0.0}
]
def filter_materials(materials):
materials_by_type = defaultdict(list)
@ -138,18 +131,62 @@ def insert_temp_recommendation_id(property_recommendations):
return property_recommendations
def score_measures():
def get_cleaned():
"""
This function will retrieve the cleaned dataset from s3 which has the cleaned
descriptions for the epc dataset
This data is stored in MessagePack format and therefore needs to be decoded
:return:
"""
cleaned = read_from_s3(
s3_file_name="cleaned_epc_data/cleaned.bson",
bucket_name="retrofit-data-{environment}".format(environment=get_settings().ENVIRONMENT)
)
cleaned = msgpack.unpackb(cleaned, raw=False)
return cleaned
def create_recommendation_scoring_data(
property: Property,
recommendation: dict,
starting_epc_data: pd.DataFrame,
ending_epc_data: pd.DataFrame,
fixed_data: pd.DataFrame,
):
"""
This wrapper function prepares data to be passed to the sap model api
:return:
"""
scoring_dict = {
"UPRN": property.data["uprn"],
"id": "+".join([str(property.id), str(recommendation["recommendation_id"])]),
"LOCAL_AUTHORITY": property.data["local-authority"],
**starting_epc_data.to_dict("records")[0],
**ending_epc_data.to_dict("records")[0],
**fixed_data.to_dict("records")[0]
}
# We update the description to indicate it's insulated
if recommendation["type"] == "wall_insulation":
scoring_dict["WALLS_DESCRIPTION_ENDING"] = UPGRADES_MAP[property.walls["clean_description"]]
elif recommendation["type"] == "floor_insulation":
scoring_dict["FLOOR_DESCRIPTION_ENDING"] = UPGRADES_MAP[property.floor["clean_description"]]
else:
raise NotImplementedError("Implement me")
return scoring_dict
@router.post("/trigger")
async def trigger_plan(body: PlanTriggerRequest):
logger.info("Connecting to db")
Session = sessionmaker(bind=db_engine)
session = Session()
session = sessionmaker(bind=db_engine)()
created_at = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
try:
session.begin()
@ -159,21 +196,17 @@ async def trigger_plan(body: PlanTriggerRequest):
epc_client = EpcClient(auth_token=get_settings().EPC_AUTH_TOKEN)
plan_input = read_csv_from_s3(bucket_name=bucket_name, filepath=body.trigger_file_path)
input_properties = []
for config in plan_input:
# We validate each record in the file. If the record is NOT valid, we need to handle this accordingly
# TODO: implment validation
# Create a record in db
property_id, is_new = create_property(
session, portfolio_id=body.portfolio_id, address=config['address'], postcode=config['postcode']
)
# if a new record was not created, we don't produduce recommendations
if not is_new:
continue
# TODO: Need to add heat demand target
create_property_targets(
session,
@ -195,37 +228,34 @@ async def trigger_plan(body: PlanTriggerRequest):
if not input_properties:
return Response(status_code=204)
logger.info("Getting EPC data")
logger.info("Getting EPC, coordinates and conservation area data")
for p in input_properties:
p.search_address_epc()
p.set_year_built()
logger.info("Getting coordinates")
# This is placeholder, until the full dataset is loaded into the database
for p in input_properties:
coordinate_data = [x for x in open_uprn_data if x['UPRN'] == int(p.data['uprn'])][0]
p.set_coordinates(coordinate_data)
logger.info("Check if property is in conservation area")
for p in input_properties:
in_conservation_area = [x for x in in_conservation_area_data if x['uprn'] == int(p.data['uprn'])][0].get(
"is_in_conservation_area"
)
p.set_is_in_conservation_area(in_conservation_area)
# The materials data could be cached or local so we don't need to make
# consistent requrests to the backend for
# consistent requests to the backend for
# the same data
# TODO: It might not be the best choice to store the materials data in a database table since thi
# table probably won't be very large and won't be updated that often. It might be better to
# store this data in s3 load it into memory when the app starts up. We will test this
logger.info("Reading in materials and cleaned datasets")
materials = get_materials(session)
materials_by_type = filter_materials(materials)
cleaned = get_cleaned()
logger.info("Getting components and properties recommendations")
# TODO: Move this to a class. We probably was a Recommender class which takes the injects the optimisers
# TODO: Move this to a class. We probably want a Recommender class which takes the injects the optimisers
# in as a dependency and then the optimisers can take the input measures in as part of the setup() method
recommendations = {}
recommendations_scoring_data = []
@ -310,42 +340,33 @@ async def trigger_plan(body: PlanTriggerRequest):
# TODO: We should use the cleaned data from get_components in the data rather than the raw
# values. We should create a method in Property which takes the EPC data and inserts the cleaned
# data
epc_data = p.data.copy()
epc_data = pd.DataFrame([epc_data])
epc_data.columns = [col.upper().replace("-", "_") for col in epc_data.columns]
starting_epc_data = epc_data[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].copy().add_suffix("_STARTING")
ending_epc_data = epc_data[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].copy().add_suffix("_ENDING")
fixed_data = epc_data[FIXED_FEATURES]
data_processor = DataProcessor(None, newdata=True)
data_processor.insert_data(pd.DataFrame([p.data.copy()]))
data_processor.pre_process()
starting_epc_data = data_processor.get_component_features(suffix="_STARTING")
ending_epc_data = data_processor.get_component_features(suffix="_ENDING")
fixed_data = data_processor.get_fixed_features()
# We update the ending record with the recommended updates and we set lodgement date to today
ending_epc_data["LODGEMENT_DATE_ENDING"] = datetime.now().strftime("%Y-%m-%d")
ending_epc_data["LODGEMENT_DATE_ENDING"] = created_at
scoring_map = {
'Solid brick, as built, no insulation (assumed)': 'Solid brick, as built, insulated (assumed)',
'Suspended, no insulation (assumed)': 'Suspended, insulated (assumed)',
'Solid, no insulation (assumed)': 'Solid, insulated (assumed)',
}
for recommendations_by_type in property_recommendations:
for rec in recommendations_by_type:
scoring_dict = {
"UPRN": p.data["uprn"],
"id": "+".join([str(p.id), str(rec["recommendation_id"])]),
"LOCAL_AUTHORITY": p.data["local-authority"],
**starting_epc_data.to_dict("records")[0],
**ending_epc_data.to_dict("records")[0],
**fixed_data.to_dict("records")[0]
}
# We update the description to indicate it's insulated
if rec["type"] == "wall_insulation":
scoring_dict["WALLS_DESCRIPTION_ENDING"] = scoring_map[p.walls["clean_description"]]
elif rec["type"] == "floor_insulation":
scoring_dict["FLOOR_DESCRIPTION_ENDING"] = scoring_map[p.floor["clean_description"]]
else:
raise NotImplementedError("Implement me")
scoring_dict = create_recommendation_scoring_data(
property=p,
recommendation=rec,
starting_epc_data=starting_epc_data,
ending_epc_data=ending_epc_data,
fixed_data=fixed_data,
)
recommendations_scoring_data.append(scoring_dict)
# cleanup
del data_processor
logger.info("Preparing data for scoring in sap change api")
recommendations_scoring_data = pd.DataFrame(recommendations_scoring_data)
@ -354,61 +375,31 @@ async def trigger_plan(body: PlanTriggerRequest):
cleaning_data = read_parquet_from_s3(
bucket_name=get_settings().DATA_BUCKET,
file_key="sap_change_model/cleaning_dataset.parquet",
)
cleaning_data = cleaning_data.rename(columns={"local-authority": "LOCAL_AUTHORITY"})
# Merge the cleaning data onto recommendations_scoring_data
).rename(columns={"local-authority": "LOCAL_AUTHORITY"})
recommendations_scoring_data[["FLOOR_HEIGHT", "TOTAL_FLOOR_AREA"]] = recommendations_scoring_data[
["FLOOR_HEIGHT", "TOTAL_FLOOR_AREA"]
].replace("", None)
# Merge the cleaning data onto recommendations_scoring_data
# Perform the same cleaning as in the model
recommendations_scoring_data = DataProcessor.apply_averages_cleaning(
data_to_clean=recommendations_scoring_data,
cleaning_data=cleaning_data,
cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"]
).drop(columns=["LOCAL_AUTHORITY"])
sap_change_model_api = SAPChangeModelAPI(portfolio_id=body.portfolio_id, timestamp=created_at)
file_location = sap_change_model_api.upload_scoring_data(
df=recommendations_scoring_data, bucket=get_settings().DATA_BUCKET
)
recommendations_scoring_data = recommendations_scoring_data.drop(columns=["LOCAL_AUTHORITY"])
# Note: We might need to perform the full pre-processing here
data_processor = DataProcessor(filepath=None)
data_processor.insert_data(recommendations_scoring_data)
data_processor.remap_columns()
recommendations_scoring_data = data_processor.data
# Remap column types
recommendations_scoring_data = recommendations_scoring_data.astype(columntypes)
# Store parquet file in s3 for scoring
created_at = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
file_location = "sap_change_predictions/{portfolio_id}/{timestamp}.parquet".format(
portfolio_id=body.portfolio_id,
timestamp=created_at
)
logger.info("Storing scoring data to s3")
save_dataframe_to_s3_parquet(
df=recommendations_scoring_data,
bucket_name=get_settings().DATA_BUCKET,
file_key=file_location
)
logger.info("Making request to sap change api")
sap_change_model_api = SAPChangeModelAPI()
response = sap_change_model_api.predict(
file_location="s3://{DATA_BUCKET}/".format(DATA_BUCKET=get_settings().DATA_BUCKET) + file_location,
created_at=created_at,
portfolio_id=body.portfolio_id
)
# Retrieve the predictions
predictions = pd.DataFrame(read_csv_from_s3(
bucket_name=get_settings().PREDICTIONS_BUCKET,
filepath=response["storage_filepath"]
))
predictions = pd.DataFrame(
read_csv_from_s3(bucket_name=get_settings().PREDICTIONS_BUCKET, filepath=response["storage_filepath"])
)
# We round the predictions
predictions["RDSAP_CHANGE"] = predictions["RDSAP_CHANGE"].astype(float).round(0)
# Extract property_id and recommendation_id
predictions["RDSAP_CHANGE"] = predictions["RDSAP_CHANGE"].astype(float).round(1)
predictions[['property_id', 'recommendation_id']] = predictions['id'].str.split('+', expand=True)
# Insert the predictions into the recommendations and run the optimiser
@ -424,7 +415,7 @@ async def trigger_plan(body: PlanTriggerRequest):
rec["recommendation_id"]
)]["RDSAP_CHANGE"].values[0]
if not rec["sap_points"]:
if rec["sap_points"] is None:
raise ValueError("Sap points missing")
input_measures = prepare_input_measures(recommendations[property_id], body.goal)

File diff suppressed because it is too large Load diff

View file

@ -1,32 +1,71 @@
import pandas as pd
import requests
from requests.exceptions import RequestException
from utils.logger import setup_logger
from utils.s3 import save_dataframe_to_s3_parquet
logger = setup_logger()
class SAPChangeModelAPI:
def __init__(self, base_url="https://api.dev.hestia.homes"):
def __init__(
self,
portfolio_id,
timestamp,
base_url="https://api.dev.hestia.homes",
):
"""
property_id (int, optional): :
:param portfolio_id: The portfolio ID to be passed in the request payload. Defaults to 4.
:param timestamp: The creation timestamp to be passed in the request payload. Defaults to None.
:param base_url:
"""
self.base_url = base_url
self.portfolio_id = portfolio_id
self.timestamp = timestamp
def predict(self, file_location, property_id="", portfolio_id=4, created_at=None):
def upload_scoring_data(self, df: pd.DataFrame, bucket: str) -> str:
"""
The sap model api needs a scoring data that is sitting in s3 to use as a dataset to score on
This method allows the user to upload a table as a parquet file. This method will return the file
location, which can be used as the file location in the predict() method
:param df: Pandas dataframe with scoring data to be uploaded to s3
:param bucket: Name of the bucket in s3 to upload to
:return:
"""
# Store parquet file in s3 for scoring
file_location = "sap_change_predictions/{portfolio_id}/{timestamp}.parquet".format(
portfolio_id=self.portfolio_id,
timestamp=self.timestamp
)
logger.info("Storing scoring data to s3")
save_dataframe_to_s3_parquet(
df=df,
bucket_name=bucket,
file_key=file_location
)
return file_location
def predict(self, file_location):
"""Makes a POST request to the SAP Change Model API with the provided parameters.
Args:
file_location (str): The file location to be passed in the request payload.
property_id (int, optional): The property ID to be passed in the request payload. Defaults to 999.
portfolio_id (int, optional): The portfolio ID to be passed in the request payload. Defaults to 4.
created_at (str, optional): The creation timestamp to be passed in the request payload. Defaults to None.
Returns:
dict: The API response as a dictionary if the request was successful, None otherwise.
"""
logger.info("Making request to sap change api")
url = f"{self.base_url}/sapmodel/predict"
payload = {
"file_location": f"s3://retrofit-data-dev/{file_location}",
"property_id": property_id,
"portfolio_id": portfolio_id,
"created_at": created_at
"property_id": "", # This should get removed
"portfolio_id": self.portfolio_id,
"created_at": self.timestamp
}
try:

View file

@ -1,3 +1,4 @@
msgpack==1.0.5
anyio==3.7.1
cffi==1.15.1
click==8.1.3

View file

@ -2,7 +2,7 @@ import pytest
import pandas as pd
from unittest.mock import Mock
from epc_api.client import EpcClient
from model_data.Property import Property
from backend.Property import Property
from open_uprn.OpenUprnClient import OpenUprnClient
from model_data.EpcClean import EpcClean
@ -19,6 +19,18 @@ mock_epc_response = {
"hotwater-description": "Hot Water Description",
"transaction-type": "rental",
"lighting-description": "Good Lighting Efficiency",
"energy-consumption-current": "50",
"co2-emissions-current": "123",
"mechanical-ventilation": "natural",
'photo-supply': 0,
"solar-water-heating-flag": "N",
"wind-turbine-count": 0,
"extension-count": 0,
"heat-loss-corridor": "no corridor",
"unheated-corridor-length": 0,
"mains-gas-flag": "Y",
"floor-height": 2.5,
"total-floor-area": 100
},
{
"inspection-date": "2023-05-01",
@ -30,6 +42,18 @@ mock_epc_response = {
"hotwater-description": "Hot Water Description",
"transaction-type": "rental",
"lighting-description": "Good Lighting Efficiency",
"energy-consumption-current": "50",
"co2-emissions-current": "123",
"mechanical-ventilation": "natural",
'photo-supply': 0,
"solar-water-heating-flag": "N",
"wind-turbine-count": 0,
"extension-count": 0,
"heat-loss-corridor": "no corridor",
"unheated-corridor-length": 0,
"mains-gas-flag": "Y",
"floor-height": 2.5,
"total-floor-area": 100
}
]
}
@ -42,6 +66,18 @@ mock_epc_response_dupe = {
'mainheat-description': 'Main Heating Description', 'hotwater-description': 'Hot Water Description',
"transaction-type": "rental",
"lighting-description": "Good Lighting Efficiency",
"energy-consumption-current": "50",
"co2-emissions-current": "123",
"mechanical-ventilation": "natural",
'photo-supply': 0,
"solar-water-heating-flag": "N",
"wind-turbine-count": 0,
"extension-count": 0,
"heat-loss-corridor": "no corridor",
"unheated-corridor-length": 0,
"mains-gas-flag": "Y",
"floor-height": 2.5,
"total-floor-area": 100
},
{
'inspection-date': '2023-05-01', 'some-other-key': 'some-other-value',
@ -50,6 +86,18 @@ mock_epc_response_dupe = {
'hotwater-description': 'Hot Water Description',
"transaction-type": "rental",
"lighting-description": "Good Lighting Efficiency",
"energy-consumption-current": "50",
"co2-emissions-current": "123",
"mechanical-ventilation": "natural",
'photo-supply': 0,
"solar-water-heating-flag": "N",
"wind-turbine-count": 0,
"extension-count": 0,
"heat-loss-corridor": "no corridor",
"unheated-corridor-length": 0,
"mains-gas-flag": "Y",
"floor-height": 2.5,
"total-floor-area": 100
},
{
'inspection-date': '2023-06-01', 'some-other-key': 'duplicate-date',
@ -58,6 +106,18 @@ mock_epc_response_dupe = {
'mainheat-description': 'Main Heating Description', 'hotwater-description': 'Hot Water Description',
"transaction-type": "rental",
"lighting-description": "Good Lighting Efficiency",
"energy-consumption-current": "50",
"co2-emissions-current": "123",
"mechanical-ventilation": "natural",
'photo-supply': 0,
"solar-water-heating-flag": "N",
"wind-turbine-count": 0,
"extension-count": 0,
"heat-loss-corridor": "no corridor",
"unheated-corridor-length": 0,
"mains-gas-flag": "Y",
"floor-height": 2.5,
"total-floor-area": 100
}
]
}
@ -66,11 +126,11 @@ mock_epc_response_dupe = {
class TestProperty:
@pytest.fixture(autouse=True)
def property_instance(self, mock_epc_client, mock_open_uprn_client, mock_cleaner):
return Property("AB12CD", "Test Address", epc_client=mock_epc_client)
return Property(1, "AB12CD", "Test Address", epc_client=mock_epc_client)
@pytest.fixture(autouse=True)
def property_instance_dupe_data(self, mock_epc_client_dupe_data):
return Property("AB12CD", "Test Address", epc_client=mock_epc_client_dupe_data)
return Property(2, "AB12CD", "Test Address", epc_client=mock_epc_client_dupe_data)
@pytest.fixture
def mock_epc_client(self):
@ -99,15 +159,26 @@ class TestProperty:
@pytest.fixture
def mock_cleaner(self):
mock_cleaner = Mock(spec=EpcClean(data=[
{"roof-description": "Roof Description"},
{"walls-description": "Walls Description"},
{"windows-description": "Windows Description"},
{"mainheat-description": "Main Heating Description"},
{"hotwater-description": "Hot Water Description"},
{"lighting-description": "Good Lighting Efficiency"},
{"low-energy-lighting": 0}
]))
lighting_averages = [
{'lighting-description': 'good lighting efficiency', 'low-energy-lighting': 99.26666666666667},
{'lighting-description': 'excellent lighting efficiency', 'low-energy-lighting': 100.0},
{'lighting-description': 'below average lighting efficiency', 'low-energy-lighting': 0.0}
]
cleaner_spec = EpcClean(
data=[
{"roof-description": "Roof Description"},
{"walls-description": "Walls Description"},
{"windows-description": "Windows Description"},
{"mainheat-description": "Main Heating Description"},
{"hotwater-description": "Hot Water Description"},
{"lighting-description": "Good Lighting Efficiency"},
{"low-energy-lighting": 0}
],
lighting_averages=lighting_averages
)
mock_cleaner = Mock(spec=cleaner_spec)
mock_cleaner.cleaned = {
"roof-description": [{"original_description": "Roof Description"}],
"walls-description": [{"original_description": "Walls Description"}],
@ -119,14 +190,14 @@ class TestProperty:
return mock_cleaner
def test_init(self, mock_epc_client):
inst1 = Property("AB12CD", "Test Address", epc_client=mock_epc_client)
inst1 = Property(0, "AB12CD", "Test Address", epc_client=mock_epc_client)
# Should be mocked auth token
assert inst1.epc_client.auth_token == "mocked_auth_token"
inst2 = Property("AB12CD", "Test Address")
inst2 = Property(3, "AB12CD", "Test Address")
assert inst2.epc_client.auth_token
inst3 = Property("AB12CD", "Test Address", data={"some": "data"})
inst3 = Property(4, "AB12CD", "Test Address", data={"some": "data"})
assert inst3.data == {"some": "data"}
data = inst3.search_address_epc()
@ -143,29 +214,9 @@ class TestProperty:
with pytest.raises(Exception, match="More than one result found for this address - investigate me"):
property_instance_dupe_data.search_address_epc()
def test_get_coordinates(self, property_instance, mock_open_uprn_client):
# Set up the mock OpenUprnClient
property_instance.data = {"uprn": 12345}
property_instance.get_coordinates(mock_open_uprn_client)
# Verify that the coordinates are set correctly
assert property_instance.coordinates == {
"uprn": 12345,
"longitude": 1.2345,
"latitude": 2.3456
}
def test_get_coordinates_without_open_uprn_data(self, property_instance, mock_open_uprn_client):
# Modify the mock OpenUprnClient to not have read any data
mock_open_uprn_client.data = None
# Verify that ValueError is raised when OpenUprnClient data is None
with pytest.raises(ValueError, match="OpenUprnClient has not read data"):
property_instance.get_coordinates(mock_open_uprn_client)
def test_get_components(self, property_instance, mock_cleaner, mock_epc_client):
property_instance.search_address_epc()
property_instance.get_components(mock_cleaner)
property_instance.get_components(mock_cleaner.cleaned)
# Verify that the components are set correctly
assert property_instance.roof == {"original_description": "Roof Description"}
@ -180,7 +231,7 @@ class TestProperty:
# Verify that ValueError is raised when EpcClean doesn't contain cleaned data
with pytest.raises(ValueError, match="Cleaner does not contain cleaned data"):
property_instance.get_components(mock_cleaner)
property_instance.get_components(mock_cleaner.cleaned)
def test_get_components_no_data(self, property_instance, mock_cleaner):
# Modify the mock cleaner to have no attributes for a specific description
@ -190,7 +241,7 @@ class TestProperty:
# Verify that ValueError is raised when no attributes are found
with pytest.raises(ValueError, match="Property does not contain data"):
property_instance.get_components(mock_cleaner)
property_instance.get_components(mock_cleaner.cleaned)
def test_get_components_no_attributes(self, property_instance, mock_cleaner):
# Modify the mock cleaner to have no attributes for a specific description
@ -201,12 +252,12 @@ class TestProperty:
# Verify that ValueError is raised when no attributes are found
with pytest.raises(ValueError, match="Either No attributes or multiple found for roof-description"):
property_instance.get_components(mock_cleaner)
property_instance.get_components(mock_cleaner.cleaned)
def test_get_components_multiple_attributes(self, property_instance, mock_cleaner):
# This shouldn't happen - it would mean a cleaning error
property_instance.search_address_epc()
mock_cleaner.cleaned = {
cleaned = {
"roof-description": [
{"original_description": "Roof Description"},
{"original_description": "Roof Description"}
@ -215,4 +266,4 @@ class TestProperty:
# Verify that ValueError is raised when multiple attributes are found
with pytest.raises(ValueError, match="Either No attributes or multiple found for roof-description"):
property_instance.get_components(mock_cleaner)
property_instance.get_components(cleaned)

View file

@ -7,7 +7,7 @@ from model_data.EpcClean import EpcClean
from model_data.analysis.UvalueEstimations import UvalueEstimations
from model_data.simulation_system.core.Settings import EARLIEST_EPC_DATE
from pathlib import Path
from model_data.utils import save_data_to_s3
from utils.s3 import save_data_to_s3
LAND_REGISTRY_PATHS = [
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-monthly-update-new-version.csv",

View file

@ -13,14 +13,28 @@ class LightingAttributes:
def __init__(self, description, averages):
self.description: str = clean_description(description.lower())
translation = self.WELSH_TEXT.get(self.description)
if translation:
self.nodata = False
self.description = translation
self.welsh_translation_search()
self.description = correct_spelling(self.description)
self.averages = averages
def welsh_translation_search(self):
"""
For welsh text describing the percentage of low energy lighting, we match the regular
expression and perform the translation
"""
lel_match = re.search(r"goleuadau ynni-isel mewn (\d+)%? ogçör mannau gosod", self.description)
if lel_match:
# Perform the actual translation
percentage = lel_match.group(1)
self.description = f"low energy lighting in {percentage}% of fixed outlets"
else:
translation = self.WELSH_TEXT.get(self.description)
if translation:
self.nodata = False
self.description = translation
def process(self):
description = self.description

View file

@ -12,6 +12,9 @@ from model_data.simulation_system.core.Settings import (
FLOOR_LEVEL_MAP,
BUILT_FORM_REMAP,
COLUMNS_TO_MERGE_ON,
COMPONENT_FEATURES,
FIXED_FEATURES,
COLUMNTYPES
)
from typing import List
@ -21,9 +24,16 @@ class DataProcessor:
Handle data loading and data preprocessing
"""
def __init__(self, filepath: Path | None) -> None:
def __init__(self, filepath: Path | None, newdata: bool = False) -> None:
"""
:param filepath: If specified, is the physical location of the data
:param newdata: Indicates if we are processing new, testing data.
In this instance, there are some operations we do not
want to perform, such as confine_data()
"""
self.filepath = filepath
self.data = None
self.newdata = newdata
def load_data(self, low_memory=False) -> None:
if not self.filepath:
@ -140,14 +150,30 @@ class DataProcessor:
break
to_index -= 1
def reformat_columns(self):
"""
This function applies the re-formattng of columns from lower case to capitalised
When requesting the epc data from the api, the columns are lower case
and separated by a hyphen, whereas in the bulk download, the columns
are capitalised and separated by underscores. If rename_columns is True
we convert the columns from lower case to capitalised format
:return:
"""
self.data.columns = [col.upper().replace("-", "_") for col in self.data.columns]
def pre_process(self) -> pd.DataFrame:
"""
Load data and begin initial cleaning
"""
if not self.data:
if self.data is None:
self.load_data(low_memory=DATA_PROCESSOR_SETTINGS["low_memory"])
self.confine_data()
if self.newdata:
self.reformat_columns()
if not self.newdata:
self.confine_data()
# We have some non-standard construction age bands which we'll clean for matching
self.standardise_construction_age_band()
@ -159,9 +185,10 @@ class DataProcessor:
self.clean_multi_glaze_proportion()
self.clean_photo_supply()
self.retain_multiple_epc_properties(
epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"]
)
if not self.newdata:
self.retain_multiple_epc_properties(
epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"]
)
self.remap_columns()
if DATA_PROCESSOR_SETTINGS["epc_minimum_count"] >= 1:
@ -169,6 +196,8 @@ class DataProcessor:
self.fill_na_fields()
self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
# Final re-casting after data transformed and prepared
self.data = self.data.astype(COLUMNTYPES)
return self.data
@ -178,6 +207,7 @@ class DataProcessor:
"""
# Each uprn can fille backward from recent and forward fill from oldest
# The groupby changes the order and we use the index to make the original data
filled_data = (
self.data.groupby("UPRN", group_keys=True)[columns_to_fill]
.apply(lambda group: group.fillna(method="bfill").fillna(method="ffill"))
@ -188,6 +218,11 @@ class DataProcessor:
self.data[columns_to_fill] = filled_data[columns_to_fill]
# For floor area, we also replace "" values with None
self.data[["FLOOR_HEIGHT", "TOTAL_FLOOR_AREA"]] = self.data[
["FLOOR_HEIGHT", "TOTAL_FLOOR_AREA"]
].replace("", None)
def remap_columns(self):
"""
Remap all columns, for any non values
@ -430,3 +465,24 @@ class DataProcessor:
data_to_clean.drop(columns=[f"{col}_AVERAGE"], inplace=True)
return data_to_clean
def get_component_features(self, suffix: str) -> pd.DataFrame:
"""
This function will return the property components such as the walls, roof, heating etc
as well as lodgement date. These are features that we expect might change from one EPC to the
next
:param suffix: Should be one of "_STARTING" or "_ENDING"
:return: Pandas dataframe containing the subset of columns defined in COMPONENT_FEATURES
"""
if suffix not in ["_STARTING", "_ENDING"]:
raise Exception("Suffix should be one of _STARTING or _ENFING")
return self.data[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].copy().add_suffix(suffix)
def get_fixed_features(self) -> pd.DataFrame:
"""
Returns the fixed features that we don't believe should vary from one EPC to the next
:return: Pandas dataframe containing the columns defined in FIXED_FEATURES
"""
return self.data[FIXED_FEATURES]

View file

@ -134,6 +134,7 @@ EARLIEST_EPC_DATE = "2014-08-01"
RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
CARBON_RESPONSE = "CO2_EMISSIONS_CURRENT"
def ordinal(n):
@ -167,3 +168,29 @@ DATA_PROCESSOR_SETTINGS = {
"epc_minimum_count": 1,
"column_mappings": {"UPRN": [int, str]},
}
# This has a manual mapping of the column types required
COLUMNTYPES = {
'UPRN': 'object', 'TOTAL_FLOOR_AREA': 'float64', 'FLOOR_HEIGHT': 'float64', 'PROPERTY_TYPE': 'object',
'BUILT_FORM': 'object', 'CONSTITUENCY': 'object', 'NUMBER_HABITABLE_ROOMS': 'float64',
'NUMBER_HEATED_ROOMS': 'float64', 'FIXED_LIGHTING_OUTLETS_COUNT': 'float64', 'FLOOR_LEVEL': 'float64',
'CONSTRUCTION_AGE_BAND': 'object',
'TRANSACTION_TYPE': 'object',
'WALLS_DESCRIPTION': 'object',
'FLOOR_DESCRIPTION': 'object',
'LIGHTING_DESCRIPTION': 'object',
'ROOF_DESCRIPTION': 'object',
'MAINHEAT_DESCRIPTION': 'object',
'HOTWATER_DESCRIPTION': 'object', 'MAIN_FUEL': 'object',
'MECHANICAL_VENTILATION': 'object',
'SECONDHEAT_DESCRIPTION': 'object', 'ENERGY_TARIFF': 'object',
'SOLAR_WATER_HEATING_FLAG': 'object', 'PHOTO_SUPPLY': 'float64',
'WINDOWS_DESCRIPTION': 'object',
'GLAZED_TYPE': 'object',
'MULTI_GLAZE_PROPORTION': 'float64',
'LOW_ENERGY_LIGHTING': 'float64',
'NUMBER_OPEN_FIREPLACES': 'float64',
'MAINHEATCONT_DESCRIPTION': 'object',
'EXTENSION_COUNT': 'float64',
'LODGEMENT_DATE': 'object',
}

View file

@ -1,20 +1,161 @@
import pandas as pd
from tqdm import tqdm
import msgpack
from pathlib import Path
from simulation_system.core.Settings import (
from model_data.simulation_system.core.Settings import (
MANDATORY_FIXED_FEATURES,
LATEST_FIELD,
COMPONENT_FEATURES,
RDSAP_RESPONSE,
HEAT_DEMAND_RESPONSE,
COLUMNS_TO_MERGE_ON,
EARLIEST_EPC_DATE
EARLIEST_EPC_DATE,
CARBON_RESPONSE,
)
from simulation_system.core.DataProcessor import DataProcessor
from utils import save_dataframe_to_s3_parquet
from model_data.simulation_system.core.DataProcessor import DataProcessor
from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3
DATA_DIRECTORY = Path(__file__).parent / "simulation_system" / "data" / "all-domestic-certificates"
DATA_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"
def get_cleaned():
"""
This function will retrieve the cleaned dataset from s3 which has the cleaned
descriptions for the epc dataset
This data is stored in MessagePack format and therefore needs to be decoded
:return:
"""
cleaned = read_from_s3(
s3_file_name="cleaned_epc_data/cleaned.bson",
bucket_name="retrofit-data-dev"
)
cleaned = msgpack.unpackb(cleaned, raw=False)
return cleaned
def process_and_prune_desriptions(df, cleaned_lookup):
"""
This method will merge on the cleaned lookup table and ensure that the building fabric in the
starting and ending EPC is consistent, so ensure that we are performing our modelling on the cleanest
possible dataset.
:param df:
:param cleaned_lookup:
:return:
"""
# TODO: In a future iteration, we can test using the binary features and the insulation thickness
# estimates, we well as estimated U-values
cols_to_drop = {
"walls": [
'original_description', 'thermal_transmittance',
'thermal_transmittance_unit', 'is_cavity_wall', 'is_filled_cavity',
'is_solid_brick', 'is_system_built', 'is_timber_frame',
'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_assumed',
'is_sandstone_or_limestone', 'insulation_thickness',
'external_insulation', 'internal_insulation',
'original_description_ENDING',
'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING',
'is_cavity_wall_ENDING', 'is_filled_cavity_ENDING',
'is_solid_brick_ENDING', 'is_system_built_ENDING',
'is_timber_frame_ENDING', 'is_granite_or_whinstone_ENDING',
'is_as_built_ENDING', 'is_cob_ENDING', 'is_assumed_ENDING',
'is_sandstone_or_limestone_ENDING', 'insulation_thickness_ENDING',
'external_insulation_ENDING', 'internal_insulation_ENDING',
],
"floor": [
'original_description', 'thermal_transmittance',
'thermal_transmittance_unit', 'is_assumed', 'is_to_unheated_space',
'is_to_external_air', 'is_suspended', 'is_solid',
'another_property_below', 'insulation_thickness', 'no_data',
'original_description_ENDING',
'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING',
'is_assumed_ENDING', 'is_to_unheated_space_ENDING',
'is_to_external_air_ENDING', 'is_suspended_ENDING', 'is_solid_ENDING',
'another_property_below_ENDING', 'insulation_thickness_ENDING',
'no_data_ENDING',
],
"roof": [
'original_description', 'clean_description', 'thermal_transmittance',
'thermal_transmittance_unit', 'is_pitched', 'is_roof_room', 'is_loft',
'is_flat', 'is_thatched', 'is_at_rafters', 'is_assumed',
'has_dwelling_above', 'is_valid', 'insulation_thickness',
'original_description_ENDING', 'clean_description_ENDING',
'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING',
'is_pitched_ENDING', 'is_roof_room_ENDING', 'is_loft_ENDING',
'is_flat_ENDING', 'is_thatched_ENDING', 'is_at_rafters_ENDING',
'is_assumed_ENDING', 'has_dwelling_above_ENDING', 'is_valid_ENDING',
'insulation_thickness_ENDING',
]
}
for component in ["walls", "floor", "roof"]:
component_upper = component.upper()
df = df.merge(
pd.DataFrame(cleaned_lookup[f"{component}-description"]),
how="left",
left_on=f"{component_upper}_DESCRIPTION_STARTING",
right_on="original_description",
).merge(
pd.DataFrame(cleaned_lookup[f"{component}-description"]),
how="left",
left_on=f"{component_upper}_DESCRIPTION_ENDING",
right_on="original_description",
suffixes=("", "_ENDING")
)
if component == "walls":
# We make sure the wall construction hasn't changed
df = df[
(df["is_cavity_wall"] == df["is_cavity_wall_ENDING"]) &
(df["is_solid_brick"] == df["is_solid_brick_ENDING"]) &
(df["is_timber_frame"] == df["is_timber_frame_ENDING"]) &
(df["is_granite_or_whinstone"] == df["is_granite_or_whinstone_ENDING"]) &
(df["is_cob"] == df["is_cob_ENDING"]) &
(df["is_sandstone_or_limestone"] == df["is_sandstone_or_limestone_ENDING"])
]
elif component == "floor":
df = df[
(df["is_suspended"] == df["is_suspended_ENDING"]) &
(df["is_solid"] == df["is_solid_ENDING"]) &
(df["another_property_below"] == df["another_property_below_ENDING"]) &
(df["is_to_unheated_space"] == df["is_to_unheated_space_ENDING"])
]
else:
df = df[
(df["is_pitched"] == df["is_pitched_ENDING"]) &
(df["is_roof_room"] == df["is_roof_room_ENDING"]) &
(df["is_loft"] == df["is_loft_ENDING"]) &
(df["is_flat"] == df["is_flat_ENDING"]) &
(df["is_thatched"] == df["is_thatched_ENDING"]) &
(df["is_at_rafters"] == df["is_at_rafters_ENDING"]) &
(df["has_dwelling_above"] == df["has_dwelling_above_ENDING"])
]
# Drop the binary indicators and replace the original description with the cleaned version
# Drop original cols
original_cols = [
f"{component_upper}_DESCRIPTION_STARTING", f"{component_upper}_DESCRIPTION_ENDING"
]
df = df.drop(
columns=cols_to_drop[component] + original_cols
).rename(
columns={
"clean_description": f"{component_upper}_DESCRIPTION_STARTING",
"clean_description_ENDING": f"{component_upper}_DESCRIPTION_ENDING",
}
)
return df
def app():
@ -23,6 +164,8 @@ def app():
# Data glossary:
# https://epc.opendatacommunities.org/docs/guidance#glossary
cleaned_lookup = get_cleaned()
# List all subdirectories
directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
@ -38,7 +181,7 @@ def app():
# TODO [x] : Have a look at temporal features
# TODO [x] : Floor area will impact the EPC so instead of averaging, we should have a starting and ending value.
# TODO [x]: Same as floor area for floor height
# TODO []: If fundamental building fabric changes, we should proabably discard the record
# TODO [x]: If fundamental building fabric changes, we should proabably discard the record
# TODO [x]: Should we prune records that have an exceptionally large amount of time between them?
# - leave for now and check performance after temporal features
# TODO [x]: If we have multiple EPCs lodged on the same day, should we remove them? Could be corrections?
@ -84,7 +227,7 @@ def app():
# We include the lodgement date here as we probably need to factor time into the
# model, since EPC standards and rigour have changed over time
variable_data = modified_property_data[
COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE]
COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE]
]
# Note: we look at changes between subsequent EPCS, however we could look at other permutations
@ -104,24 +247,29 @@ def app():
if gets_better:
starting_sap = earliest_record[RDSAP_RESPONSE]
starting_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE]
starting_carbon = earliest_record[CARBON_RESPONSE]
rdsap_change = latest_record[RDSAP_RESPONSE] - starting_sap
heat_demand_change = latest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
else:
starting_sap = latest_record[RDSAP_RESPONSE]
starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
rdsap_change = earliest_record[RDSAP_RESPONSE] - starting_sap
heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
carbon_change = latest_record[CARBON_RESPONSE] - starting_carbon
if rdsap_change == 0:
continue
if gets_better:
starting_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
ending_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
else:
starting_sap = latest_record[RDSAP_RESPONSE]
starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
starting_carbon = latest_record[CARBON_RESPONSE]
rdsap_change = earliest_record[RDSAP_RESPONSE] - starting_sap
heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
carbon_change = earliest_record[CARBON_RESPONSE] - starting_carbon
starting_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
ending_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
if rdsap_change == 0:
continue
features = pd.concat([starting_record, ending_record])
property_model_data.append(
@ -129,8 +277,10 @@ def app():
"UPRN": uprn,
"RDSAP_CHANGE": rdsap_change,
"HEAT_DEMAND_CHANGE": heat_demand_change,
"STARTING_SAP": starting_sap,
"STARTING_HEAT_DEMAND": starting_heat_demand,
"CARBON_CHANGE": carbon_change,
"SAP_STARTING": starting_sap,
"HEAT_DEMAND_STARTING": starting_heat_demand,
"CARBON_STARTING": starting_carbon,
**fixed_data,
**features.to_dict(),
}
@ -152,6 +302,14 @@ def app():
# floors, we may want to use the U-value. We may also want to handle the (assumed) tags
# within descriptions
# We look for key building fabric features that have changed from one EPC to the next.
# if, for example, we see that a home has gone from being a cavity wall to a solid wall, we
# remove this record, as it indicates that the quality of the EPC conducted in the first instance
# is low
# We also replace descriptions with their cleaned variants
data_by_urpn_df = process_and_prune_desriptions(data_by_urpn_df, cleaned_lookup)
dataset.append(data_by_urpn_df)
cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0]

View file

@ -1,7 +1,3 @@
import boto3
from botocore.exceptions import NoCredentialsError, PartialCredentialsError
import pandas as pd
from io import BytesIO
import re
from textblob import TextBlob
@ -28,65 +24,3 @@ def correct_spelling(text):
corrected_text = ' '.join(corrected_words)
return corrected_text
def save_dataframe_to_s3_parquet(df, bucket_name, file_key):
"""
Save a pandas DataFrame to S3 as a Parquet file.
:param df: The pandas DataFrame.
:param bucket_name: Name of the S3 bucket.
:param file_key: Key of the file (including directory path within the bucket).
"""
# Convert the DataFrame to a Parquet format in memory
parquet_buffer = BytesIO()
df.to_parquet(parquet_buffer)
# Create the boto3 client
client = boto3.client('s3')
# Upload the Parquet file to S3
client.put_object(Bucket=bucket_name, Key=file_key, Body=parquet_buffer.getvalue())
def save_data_to_s3(data, bucket_name, s3_file_name):
"""
Save an object to an S3 bucket
:param data: The data to save
:param bucket_name: The name of the S3 bucket
:param s3_file_name: The file name to use for the saved data in S3
"""
# Ensure you have AWS credentials set up - either via environment variables, AWS CLI, or IAM roles
try:
s3 = boto3.client('s3')
except NoCredentialsError:
print("Credentials not available.")
return
except PartialCredentialsError:
print("Incomplete credentials provided.")
return
try:
s3.put_object(Bucket=bucket_name, Key=s3_file_name, Body=data)
print(f'Successfully uploaded data to {bucket_name}/{s3_file_name}')
except Exception as e:
print(f'Failed to upload data to {bucket_name}/{s3_file_name}: {str(e)}')
def read_from_s3(bucket_name, s3_file_name):
"""
Read an object from s3. Decoding of the data is left for outside of this function
:param bucket_name: The name of the S3 bucket
:param s3_file_name: The file name to use for the saved data in S3
"""
# Initialize a session using Amazon S3
s3 = boto3.resource('s3')
# Get the MessagePack data from S3
obj = s3.Object(bucket_name, s3_file_name)
data = obj.get()['Body'].read()
return data

View file

@ -1,4 +1,4 @@
[pytest]
pythonpath = .
addopts = --cov-report term-missing --cov=model_data --cov=recommendations
testpaths = model_data/tests recommendations/tests
testpaths = model_data/tests recommendations/tests backend/tests

View file

@ -0,0 +1,8 @@
# This map defines the upgrades that are possible to be recommended by the recommendation engine
# For example,
# TODO: once we use cleaned descriptions, this should be updated using the cleaned descriptions
UPGRADES_MAP = {
'Solid brick, as built, no insulation (assumed)': 'Solid brick, as built, insulated (assumed)',
'Suspended, no insulation (assumed)': 'Suspended, insulated (assumed)',
'Solid, no insulation (assumed)': 'Solid, insulated (assumed)',
}

View file

@ -20,7 +20,7 @@ fi
# Step 2: Build the Docker image
echo "Building Docker image..."
docker build -t $IMAGE_NAME:$TAG -f backend/docker/lambda.Dockerfile .
docker build --platform linux/amd64 -t $IMAGE_NAME:$TAG -f backend/docker/lambda.Dockerfile .
# Step 3: Run the Docker image with the emulator, .env file, and AWS credentials
echo "Starting the Docker container..."

65
utils/s3.py Normal file
View file

@ -0,0 +1,65 @@
import boto3
from io import BytesIO
from botocore.exceptions import NoCredentialsError, PartialCredentialsError
def read_from_s3(bucket_name, s3_file_name):
"""
Read an object from s3. Decoding of the data is left for outside of this function
:param bucket_name: The name of the S3 bucket
:param s3_file_name: The file name to use for the saved data in S3
"""
# Initialize a session using Amazon S3
s3 = boto3.resource('s3')
# Get the MessagePack data from S3
obj = s3.Object(bucket_name, s3_file_name)
data = obj.get()['Body'].read()
return data
def save_data_to_s3(data, bucket_name, s3_file_name):
"""
Save an object to an S3 bucket
:param data: The data to save
:param bucket_name: The name of the S3 bucket
:param s3_file_name: The file name to use for the saved data in S3
"""
# Ensure you have AWS credentials set up - either via environment variables, AWS CLI, or IAM roles
try:
s3 = boto3.client('s3')
except NoCredentialsError:
print("Credentials not available.")
return
except PartialCredentialsError:
print("Incomplete credentials provided.")
return
try:
s3.put_object(Bucket=bucket_name, Key=s3_file_name, Body=data)
print(f'Successfully uploaded data to {bucket_name}/{s3_file_name}')
except Exception as e:
print(f'Failed to upload data to {bucket_name}/{s3_file_name}: {str(e)}')
def save_dataframe_to_s3_parquet(df, bucket_name, file_key):
"""
Save a pandas DataFrame to S3 as a Parquet file.
:param df: The pandas DataFrame.
:param bucket_name: Name of the S3 bucket.
:param file_key: Key of the file (including directory path within the bucket).
"""
# Convert the DataFrame to a Parquet format in memory
parquet_buffer = BytesIO()
df.to_parquet(parquet_buffer)
# Create the boto3 client
client = boto3.client('s3')
# Upload the Parquet file to S3
client.put_object(Bucket=bucket_name, Key=file_key, Body=parquet_buffer.getvalue())