From 9ac6b25b9fa1adf91926109d6a5610d50bee28b8 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 23 Jan 2024 18:06:34 +0000 Subject: [PATCH] improving data read code to create standardised matching_address and house number --- backend/ml_models/Valuation.py | 2 + .../ha_15_32/ha_analysis_batch_3.py | 134 +++++++++++++++--- etl/testing_data/livewest_pilot.py | 38 +++++ .../the_guiness_partnership_pilot.py | 38 +++++ 4 files changed, 192 insertions(+), 20 deletions(-) create mode 100644 etl/testing_data/livewest_pilot.py create mode 100644 etl/testing_data/the_guiness_partnership_pilot.py diff --git a/backend/ml_models/Valuation.py b/backend/ml_models/Valuation.py index dadef9a9..ff771252 100644 --- a/backend/ml_models/Valuation.py +++ b/backend/ml_models/Valuation.py @@ -22,6 +22,8 @@ class PropertyValuation: 100021192109: 650000, # Based on Zoopla 766249482: 358000, # Based on Zoopla estimate for 19 Spring Lane, 3 bedroom semi-detached 100120703802: 277000, # Based on Zoopla + 10014469685: 286000, # Based on Zoopla + 10001328782: 196000, # Based on Zoopla } # We base our valuation uplifts on a number of sources diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 85f8704d..54cd7c58 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1,8 +1,7 @@ import os -import msgpack import openpyxl from pathlib import Path -from tqdm import tqdm +import msgpack from datetime import datetime import pandas as pd import numpy as np @@ -48,6 +47,14 @@ class DataLoader: } } + MIN_ROWS = { + "ha_1": 2, + "ha_6": 2, + "ha_14": 3, # The spreadsheet starts from the third row + "ha_39": 2, + "ha_107": 2, + } + def __init__(self, files, use_cache): self.files = files self.use_cache = use_cache @@ -60,11 +67,14 @@ class DataLoader: sheet = workbook[sheet_name] else: sheet = workbook.active - sheet_colnames = [cell.value for cell in sheet[1]] + 
sheet_colnames = [cell.value for cell in sheet[self.MIN_ROWS[ha_name] - 1]] rows_data = [] rows_colors = [] - for row in tqdm(sheet.iter_rows(min_row=2, values_only=False)): # Assuming the first row is headers + for row in tqdm( + sheet.iter_rows(min_row=self.MIN_ROWS[ha_name], values_only=False) + ): # Assuming the first row is headers + row_data = [cell.value for cell in row] # This will get you the cell values row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None # row_color = COLOR_INDEX[row_color] @@ -73,8 +83,12 @@ class DataLoader: asset_list = pd.DataFrame(rows_data, columns=sheet_colnames) asset_list = asset_list.loc[:, asset_list.columns.notnull()] + asset_list['row_color'] = rows_colors + # Remove entirely empty rows - consider all rows apart from row_color + asset_list = asset_list.loc[asset_list.loc[:, asset_list.columns != 'row_color'].notnull().any(axis=1)] + asset_list_colours = self.COLOUR_CONFIG[ha_name]["asset_list"] asset_list["row_colour_name"] = np.where( @@ -92,6 +106,54 @@ class DataLoader: # Add in asset_list_row_id asset_list["asset_list_row_id"] = [ha_name + str(i) for i in range(0, len(asset_list))] + # Prepare the asset list + # Depending on the HA, we need to rename some columns + if ha_name == "ha_1": + asset_list["matching_address"] = asset_list["Address"].str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["Address - Postcode"].str.lower().str.strip() + elif ha_name == "ha_6": + asset_list["matching_address"] = asset_list["propertyaddress"].str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["Post Code"].str.lower().str.strip() + elif ha_name == "ha_14": + # Create matching_address by concatenating Address 1, Address 2, Address 3, Address 4, Postcode + asset_list["matching_address"] = asset_list["Address 1"].str.lower().str.strip() + ", " + \ + asset_list["Address 2"].str.lower().str.strip() + ", " + \ + asset_list["Address 3"].str.lower().str.strip() 
+ ", " + \ + asset_list["Address 4"].str.lower().str.strip() + ", " + \ + asset_list["Postcode"].str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["Postcode"].str.lower().str.strip() + elif ha_name == "ha_39": + # Create matching_address by concatenating add_1, add_2, add_3, add_4, add_5, post_code + asset_list["matching_address"] = asset_list["add_1"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["add_2"].str.lower().str.strip() + ", " + \ + asset_list["add_3"].str.lower().str.strip() + ", " + \ + asset_list["add_4"].str.lower().str.strip() + ", " + \ + asset_list["add_5"].str.lower().str.strip() + ", " + \ + asset_list["post_code"].str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["post_code"].str.lower().str.strip() + elif ha_name == "ha_107": + # Create matching_address by concatenating House No, Street, Town, District, Postcode + asset_list["matching_address"] = asset_list["House No"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Street"].str.lower().str.strip() + ", " + \ + asset_list["Town"].str.lower().str.strip() + ", " + \ + asset_list["District"].str.lower().str.strip() + ", " + \ + asset_list["Postcode"].str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["Postcode"].str.lower().str.strip() + else: + raise NotImplementedError("implement me") + + if ha_name in ["ha_107"]: + asset_list["HouseNo"] = asset_list["House No"].copy() + else: + split_addresses = asset_list['matching_address'].str.split(',', expand=True) + house_numbers = split_addresses[0].str.split(' ', expand=True) + # The first column should be HouseNo - we aren't interested in the other columns, but we don't know how + # many columns there might be + house_numbers = house_numbers.iloc[:, 0:1] + house_numbers.columns = ['HouseNo'] + + asset_list = pd.concat([asset_list, house_numbers[["HouseNo"]]], axis=1) + return asset_list def load_survey_list(self, file_path, ha_name, asset_list, sheet_name=None): @@ 
-165,22 +227,10 @@ class DataLoader: def merge_ha_6(asset_list, survey_list): # Correct the asset list - asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Baggott Place", "Baggotts Place") - asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Cherry Tree", "Cherrytree") - asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Maryhill Close", "Mary Hill Close") - asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Moffat Way", "Moffatt Way") - - # Prepare the asset list - asset_list["matching_address"] = asset_list["propertyaddress"].str.lower().str.strip() - asset_list["matching_postcode"] = asset_list["Post Code"].str.lower().str.strip() - - split_addresses = asset_list['matching_address'].str.split(',', expand=True) - split_addresses.columns = ['temp', 'address2', 'address3', 'address4', 'address5'] - house_numbers = split_addresses['temp'].str.split(' ', expand=True) - house_numbers.columns = ['HouseNo', 'part1', 'part2', "part3", "part4", "part5"] - - asset_list = pd.concat([asset_list, house_numbers[["HouseNo"]]], axis=1) - del split_addresses, house_numbers + asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("baggott place", "baggotts place") + asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("cherry tree", "cherrytree") + asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("maryhill close", "mary hill close") + asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("moffat way", "moffatt way") # Correct the survey list survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( @@ -403,6 +453,30 @@ class DataLoader: ) +def get_epc_data(loader): + if not loader.data: + raise ValueError("Data not found - please run loader.load() first") + + property_type_lookup = {} + + for ha_name, data_assets in loader.data.items(): + # For each HA, we pull in the 
data required, and store in S3 + asset_list = data_assets["asset_list"] + + # We iterate through the asset list and pull what we need + for index, property_meta in tqdm(asset_list.iterrows(), total=len(asset_list)): + searcher = SearchEpc( + address1=property_meta["No."], + postcode=property_meta["Postcode"], + auth_token=EPC_AUTH_TOKEN, + os_api_key=None, + full_address=property_meta["Address"] + ) + searcher.ordnance_survey_client.property_type = property_type_lookup[property_meta["Type"]]["property-type"] + searcher.ordnance_survey_client.built_form = property_type_lookup[property_meta["Type"]]["built-form"] + searcher.find_property(skip_os=True) + + def app(): """ This app contains the housign association analysis for HAs 1, 6, 14, 39 and 107. @@ -451,3 +525,23 @@ def app(): loader = DataLoader(files, use_cache) loader.load() + + # TODO: We probably need to make sure that we have all of the columns that we need + + # We load in the additional data required to perform the analysis + + cleaned = read_from_s3( + s3_file_name="cleaned_epc_data/cleaned.bson", + bucket_name="retrofit-data-dev" + ) + cleaned = msgpack.unpackb(cleaned, raw=False) + + cleaning_data = read_dataframe_from_s3_parquet( + bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet", + ) + + created_at = datetime.now().isoformat() + + photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket="retrofit-data-dev") + + get_epc_data(loader) diff --git a/etl/testing_data/livewest_pilot.py b/etl/testing_data/livewest_pilot.py new file mode 100644 index 00000000..580c16d0 --- /dev/null +++ b/etl/testing_data/livewest_pilot.py @@ -0,0 +1,38 @@ +""" +This script will create an input csv for the recommendation engine and upload it to S3, which can be used for +testing +""" +import os + +import pandas as pd +from utils.s3 import save_csv_to_s3 + +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", None) +USER_ID = 8 +PORTFOLIO_ID = 61 + + +def app(): + pilot_file 
= pd.DataFrame( + [ + {"address": "42, Foxes Field", "postcode": "TR18 3RJ", "Notes": None}, + {"address": "11, Cranley Gardens", "postcode": "TQ13 8UT", "Notes": None}, + ] + ) + + # Store the data in s3 + filename = f"{USER_ID}/{PORTFOLIO_ID}/livewest_pilot_file.csv" + save_csv_to_s3( + dataframe=pilot_file, + bucket_name="retrofit-plan-inputs-dev", + file_name=filename + ) + + body = { + "portfolio_id": str(PORTFOLIO_ID), + "housing_type": "Social", + "goal": "Increase EPC", + "goal_value": "C", + "trigger_file_path": filename + } + print(body) diff --git a/etl/testing_data/the_guiness_partnership_pilot.py b/etl/testing_data/the_guiness_partnership_pilot.py new file mode 100644 index 00000000..496ea7ea --- /dev/null +++ b/etl/testing_data/the_guiness_partnership_pilot.py @@ -0,0 +1,38 @@ +""" +This script will create an input csv for the recommendation engine and upload it to S3, which can be used for +testing +""" +import os + +import pandas as pd +from utils.s3 import save_csv_to_s3 + +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", None) +USER_ID = 8 +PORTFOLIO_ID = 59 + + +def app(): + pilot_file = pd.DataFrame( + [ + {"address": "10 Elm Close", "postcode": "CV37 8XL", "Notes": None}, + {"address": "21, Spring Lane", "postcode": "MK17 0QP", "Notes": None}, + ] + ) + + # Store the data in s3 + filename = f"{USER_ID}/{PORTFOLIO_ID}/the_guiness_partnership_pilot_file.csv" + save_csv_to_s3( + dataframe=pilot_file, + bucket_name="retrofit-plan-inputs-dev", + file_name=filename + ) + + body = { + "portfolio_id": str(PORTFOLIO_ID), + "housing_type": "Social", + "goal": "Increase EPC", + "goal_value": "C", + "trigger_file_path": filename + } + print(body)