improving data read code to create standardised matching_address and house number

This commit is contained in:
Khalim Conn-Kowlessar 2024-01-23 18:06:34 +00:00
parent 0620c45a22
commit 9ac6b25b9f
4 changed files with 192 additions and 20 deletions

View file

@ -22,6 +22,8 @@ class PropertyValuation:
100021192109: 650000, # Based on Zoopla
766249482: 358000, # Based on Zoopla estimate for 19 Spring Lane, 3 bedroom semi-detached
100120703802: 277000, # Based on Zoopla
10014469685: 286000, # Based on Zoopla
10001328782: 196000, # Based on Zoopla
}
# We base our valuation uplifts on a number of sources

View file

@ -1,8 +1,7 @@
import os
import msgpack
import openpyxl
from pathlib import Path
from tqdm import tqdm
import msgpack
from datetime import datetime
import pandas as pd
import numpy as np
@ -48,6 +47,14 @@ class DataLoader:
}
}
# First worksheet row (1-indexed) that holds data for each housing association's asset list.
# The column headers are read from the row immediately above it, i.e. row MIN_ROWS[ha_name] - 1,
# and data rows are iterated starting from row MIN_ROWS[ha_name].
MIN_ROWS = {
"ha_1": 2,
"ha_6": 2,
"ha_14": 3, # Headers are on the second row, so the data starts from the third row
"ha_39": 2,
"ha_107": 2,
}
def __init__(self, files, use_cache):
self.files = files
self.use_cache = use_cache
@ -60,11 +67,14 @@ class DataLoader:
sheet = workbook[sheet_name]
else:
sheet = workbook.active
sheet_colnames = [cell.value for cell in sheet[1]]
sheet_colnames = [cell.value for cell in sheet[self.MIN_ROWS[ha_name] - 1]]
rows_data = []
rows_colors = []
for row in tqdm(sheet.iter_rows(min_row=2, values_only=False)): # Assuming the first row is headers
for row in tqdm(
sheet.iter_rows(min_row=self.MIN_ROWS[ha_name], values_only=False)
): # Assuming the first row is headers
row_data = [cell.value for cell in row] # This will get you the cell values
row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None
# row_color = COLOR_INDEX[row_color]
@ -73,8 +83,12 @@ class DataLoader:
asset_list = pd.DataFrame(rows_data, columns=sheet_colnames)
asset_list = asset_list.loc[:, asset_list.columns.notnull()]
asset_list['row_color'] = rows_colors
# Remove entirely empty rows - consider all columns apart from row_color
asset_list = asset_list.loc[asset_list.loc[:, asset_list.columns != 'row_color'].notnull().any(axis=1)]
asset_list_colours = self.COLOUR_CONFIG[ha_name]["asset_list"]
asset_list["row_colour_name"] = np.where(
@ -92,6 +106,54 @@ class DataLoader:
# Add in asset_list_row_id
asset_list["asset_list_row_id"] = [ha_name + str(i) for i in range(0, len(asset_list))]
# Prepare the asset list
# Depending on the HA, we need to rename some columns
if ha_name == "ha_1":
asset_list["matching_address"] = asset_list["Address"].str.lower().str.strip()
asset_list["matching_postcode"] = asset_list["Address - Postcode"].str.lower().str.strip()
elif ha_name == "ha_6":
asset_list["matching_address"] = asset_list["propertyaddress"].str.lower().str.strip()
asset_list["matching_postcode"] = asset_list["Post Code"].str.lower().str.strip()
elif ha_name == "ha_14":
# Create matching_address by concatenating Address 1, Address 2, Address 3, Address 4, Postcode
asset_list["matching_address"] = asset_list["Address 1"].str.lower().str.strip() + ", " + \
asset_list["Address 2"].str.lower().str.strip() + ", " + \
asset_list["Address 3"].str.lower().str.strip() + ", " + \
asset_list["Address 4"].str.lower().str.strip() + ", " + \
asset_list["Postcode"].str.lower().str.strip()
asset_list["matching_postcode"] = asset_list["Postcode"].str.lower().str.strip()
elif ha_name == "ha_39":
# Create matching_address by concatenating add_1, add_2, add_3, add_4, add_5, post_code
asset_list["matching_address"] = asset_list["add_1"].astype(str).str.lower().str.strip() + ", " + \
asset_list["add_2"].str.lower().str.strip() + ", " + \
asset_list["add_3"].str.lower().str.strip() + ", " + \
asset_list["add_4"].str.lower().str.strip() + ", " + \
asset_list["add_5"].str.lower().str.strip() + ", " + \
asset_list["post_code"].str.lower().str.strip()
asset_list["matching_postcode"] = asset_list["post_code"].str.lower().str.strip()
elif ha_name == "ha_107":
# Create matching_address by concatenating House No, Street, Town, District, Postcode
asset_list["matching_address"] = asset_list["House No"].astype(str).str.lower().str.strip() + ", " + \
asset_list["Street"].str.lower().str.strip() + ", " + \
asset_list["Town"].str.lower().str.strip() + ", " + \
asset_list["District"].str.lower().str.strip() + ", " + \
asset_list["Postcode"].str.lower().str.strip()
asset_list["matching_postcode"] = asset_list["Postcode"].str.lower().str.strip()
else:
raise NotImplementedError("implement me")
if ha_name in ["ha_107"]:
asset_list["HouseNo"] = asset_list["House No"].copy()
else:
split_addresses = asset_list['matching_address'].str.split(',', expand=True)
house_numbers = split_addresses[0].str.split(' ', expand=True)
# The first column should be HouseNo - we aren't interested in the other columns, but we don't know how
# many columns there might be
house_numbers = house_numbers.iloc[:, 0:1]
house_numbers.columns = ['HouseNo']
asset_list = pd.concat([asset_list, house_numbers[["HouseNo"]]], axis=1)
return asset_list
def load_survey_list(self, file_path, ha_name, asset_list, sheet_name=None):
@ -165,22 +227,10 @@ class DataLoader:
def merge_ha_6(asset_list, survey_list):
# Correct the asset list
asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Baggott Place", "Baggotts Place")
asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Cherry Tree", "Cherrytree")
asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Maryhill Close", "Mary Hill Close")
asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Moffat Way", "Moffatt Way")
# Prepare the asset list
asset_list["matching_address"] = asset_list["propertyaddress"].str.lower().str.strip()
asset_list["matching_postcode"] = asset_list["Post Code"].str.lower().str.strip()
split_addresses = asset_list['matching_address'].str.split(',', expand=True)
split_addresses.columns = ['temp', 'address2', 'address3', 'address4', 'address5']
house_numbers = split_addresses['temp'].str.split(' ', expand=True)
house_numbers.columns = ['HouseNo', 'part1', 'part2', "part3", "part4", "part5"]
asset_list = pd.concat([asset_list, house_numbers[["HouseNo"]]], axis=1)
del split_addresses, house_numbers
asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("baggott place", "baggotts place")
asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("cherry tree", "cherrytree")
asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("maryhill close", "mary hill close")
asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("moffat way", "moffatt way")
# Correct the survey list
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
@ -403,6 +453,30 @@ class DataLoader:
)
def get_epc_data(loader):
    """
    Run an EPC search for every property in every loaded housing association asset list.

    :param loader: Object whose ``data`` attribute maps each housing association name to a dict holding an
        "asset_list" dataframe. Each asset list row is read for its "No.", "Postcode", "Address" and "Type" values.
    :raises ValueError: If ``loader.data`` is empty, i.e. ``loader.load()`` has not been run yet.
    """
    if not loader.data:
        raise ValueError("Data not found - please run loader.load() first")

    # Maps an asset list "Type" value to a dict with "property-type" and "built-form" entries.
    # NOTE(review): this is still empty. Indexing it directly raised a KeyError for the very first property, so
    # unknown types are now skipped (the client's own values are left untouched) until the lookup is populated.
    property_type_lookup = {}

    for data_assets in loader.data.values():
        # For each HA, we pull in the data required.
        # NOTE(review): the search results are not stored anywhere yet - confirm where they should be written.
        asset_list = data_assets["asset_list"]

        # We iterate through the asset list and pull what we need
        for _, property_meta in tqdm(asset_list.iterrows(), total=len(asset_list)):
            searcher = SearchEpc(
                address1=property_meta["No."],
                postcode=property_meta["Postcode"],
                auth_token=EPC_AUTH_TOKEN,
                os_api_key=None,
                full_address=property_meta["Address"]
            )

            type_mapping = property_type_lookup.get(property_meta["Type"])
            if type_mapping is not None:
                searcher.ordnance_survey_client.property_type = type_mapping["property-type"]
                searcher.ordnance_survey_client.built_form = type_mapping["built-form"]

            searcher.find_property(skip_os=True)
def app():
"""
This app contains the housing association analysis for HAs 1, 6, 14, 39 and 107.
@ -451,3 +525,23 @@ def app():
loader = DataLoader(files, use_cache)
loader.load()
# TODO: We probably need to make sure that we have all of the columns that we need
# We load in the additional data required to perform the analysis
cleaned = read_from_s3(
s3_file_name="cleaned_epc_data/cleaned.bson",
bucket_name="retrofit-data-dev"
)
cleaned = msgpack.unpackb(cleaned, raw=False)
cleaning_data = read_dataframe_from_s3_parquet(
bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
)
created_at = datetime.now().isoformat()
photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket="retrofit-data-dev")
get_epc_data(loader)

View file

@ -0,0 +1,38 @@
"""
This script will create an input csv for the recommendation engine and upload it to S3, which can be used for
testing
"""
import os
import pandas as pd
from utils.s3 import save_csv_to_s3
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", None)
USER_ID = 8
PORTFOLIO_ID = 61
def app():
    """
    Build the two-property pilot file, upload it to S3 as a csv and print a payload that references the upload.
    """
    pilot_rows = [
        {"address": "42, Foxes Field", "postcode": "TR18 3RJ", "Notes": None},
        {"address": "11, Cranley Gardens", "postcode": "TQ13 8UT", "Notes": None},
    ]
    pilot_frame = pd.DataFrame(pilot_rows)

    # The S3 key is namespaced by user and portfolio
    s3_key = f"{USER_ID}/{PORTFOLIO_ID}/livewest_pilot_file.csv"
    save_csv_to_s3(
        dataframe=pilot_frame,
        bucket_name="retrofit-plan-inputs-dev",
        file_name=s3_key
    )

    # Print the payload pointing at the uploaded file so it can be copied for testing
    payload = {
        "portfolio_id": str(PORTFOLIO_ID),
        "housing_type": "Social",
        "goal": "Increase EPC",
        "goal_value": "C",
        "trigger_file_path": s3_key
    }
    print(payload)

View file

@ -0,0 +1,38 @@
"""
This script will create an input csv for the recommendation engine and upload it to S3, which can be used for
testing
"""
import os
import pandas as pd
from utils.s3 import save_csv_to_s3
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", None)
USER_ID = 8
PORTFOLIO_ID = 59
def app():
    """
    Build the two-property pilot file, upload it to S3 as a csv and print a payload that references the upload.
    """
    pilot_rows = [
        {"address": "10 Elm Close", "postcode": "CV37 8XL", "Notes": None},
        {"address": "21, Spring Lane", "postcode": "MK17 0QP", "Notes": None},
    ]
    pilot_frame = pd.DataFrame(pilot_rows)

    # The S3 key is namespaced by user and portfolio
    s3_key = f"{USER_ID}/{PORTFOLIO_ID}/the_guiness_partnership_pilot_file.csv"
    save_csv_to_s3(
        dataframe=pilot_frame,
        bucket_name="retrofit-plan-inputs-dev",
        file_name=s3_key
    )

    # Print the payload pointing at the uploaded file so it can be copied for testing
    payload = {
        "portfolio_id": str(PORTFOLIO_ID),
        "housing_type": "Social",
        "goal": "Increase EPC",
        "goal_value": "C",
        "trigger_file_path": s3_key
    }
    print(payload)