mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
improving data read code to create standardised matching_address and house number
This commit is contained in:
parent
0620c45a22
commit
9ac6b25b9f
4 changed files with 192 additions and 20 deletions
|
|
@ -22,6 +22,8 @@ class PropertyValuation:
|
|||
100021192109: 650000, # Based on Zoopla
|
||||
766249482: 358000, # Based on Zoopla estimate for 19 Spring Lane, 3 bedroom semi-detached
|
||||
100120703802: 277000, # Based on Zoopla
|
||||
10014469685: 286000, # Based on Zoopla
|
||||
10001328782: 196000, # Based on Zoopla
|
||||
}
|
||||
|
||||
# We base our valuation uplifts on a number of sources
|
||||
|
|
|
|||
|
|
@ -1,8 +1,7 @@
|
|||
import os
|
||||
import msgpack
|
||||
import openpyxl
|
||||
from pathlib import Path
|
||||
from tqdm import tqdm
|
||||
import msgpack
|
||||
from datetime import datetime
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
|
@ -48,6 +47,14 @@ class DataLoader:
|
|||
}
|
||||
}
|
||||
|
||||
MIN_ROWS = {
|
||||
"ha_1": 2,
|
||||
"ha_6": 2,
|
||||
"ha_14": 3, # The spreadsheet starts from the third row
|
||||
"ha_39": 2,
|
||||
"ha_107": 2,
|
||||
}
|
||||
|
||||
def __init__(self, files, use_cache):
|
||||
self.files = files
|
||||
self.use_cache = use_cache
|
||||
|
|
@ -60,11 +67,14 @@ class DataLoader:
|
|||
sheet = workbook[sheet_name]
|
||||
else:
|
||||
sheet = workbook.active
|
||||
sheet_colnames = [cell.value for cell in sheet[1]]
|
||||
sheet_colnames = [cell.value for cell in sheet[self.MIN_ROWS[ha_name] - 1]]
|
||||
|
||||
rows_data = []
|
||||
rows_colors = []
|
||||
for row in tqdm(sheet.iter_rows(min_row=2, values_only=False)): # Assuming the first row is headers
|
||||
for row in tqdm(
|
||||
sheet.iter_rows(min_row=self.MIN_ROWS[ha_name], values_only=False)
|
||||
): # Assuming the first row is headers
|
||||
|
||||
row_data = [cell.value for cell in row] # This will get you the cell values
|
||||
row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None
|
||||
# row_color = COLOR_INDEX[row_color]
|
||||
|
|
@ -73,8 +83,12 @@ class DataLoader:
|
|||
|
||||
asset_list = pd.DataFrame(rows_data, columns=sheet_colnames)
|
||||
asset_list = asset_list.loc[:, asset_list.columns.notnull()]
|
||||
|
||||
asset_list['row_color'] = rows_colors
|
||||
|
||||
# Remove entirely empty rows - consider all columns apart from row_color
|
||||
asset_list = asset_list.loc[asset_list.loc[:, asset_list.columns != 'row_color'].notnull().any(axis=1)]
|
||||
|
||||
asset_list_colours = self.COLOUR_CONFIG[ha_name]["asset_list"]
|
||||
|
||||
asset_list["row_colour_name"] = np.where(
|
||||
|
|
@ -92,6 +106,54 @@ class DataLoader:
|
|||
# Add in asset_list_row_id
|
||||
asset_list["asset_list_row_id"] = [ha_name + str(i) for i in range(0, len(asset_list))]
|
||||
|
||||
# Prepare the asset list
|
||||
# Depending on the HA, we need to rename some columns
|
||||
if ha_name == "ha_1":
|
||||
asset_list["matching_address"] = asset_list["Address"].str.lower().str.strip()
|
||||
asset_list["matching_postcode"] = asset_list["Address - Postcode"].str.lower().str.strip()
|
||||
elif ha_name == "ha_6":
|
||||
asset_list["matching_address"] = asset_list["propertyaddress"].str.lower().str.strip()
|
||||
asset_list["matching_postcode"] = asset_list["Post Code"].str.lower().str.strip()
|
||||
elif ha_name == "ha_14":
|
||||
# Create matching_address by concatenating Address 1, Address 2, Address 3, Address 4, Postcode
|
||||
asset_list["matching_address"] = asset_list["Address 1"].str.lower().str.strip() + ", " + \
|
||||
asset_list["Address 2"].str.lower().str.strip() + ", " + \
|
||||
asset_list["Address 3"].str.lower().str.strip() + ", " + \
|
||||
asset_list["Address 4"].str.lower().str.strip() + ", " + \
|
||||
asset_list["Postcode"].str.lower().str.strip()
|
||||
asset_list["matching_postcode"] = asset_list["Postcode"].str.lower().str.strip()
|
||||
elif ha_name == "ha_39":
|
||||
# Create matching_address by concatenating add_1, add_2, add_3, add_4, add_5, post_code
|
||||
asset_list["matching_address"] = asset_list["add_1"].astype(str).str.lower().str.strip() + ", " + \
|
||||
asset_list["add_2"].str.lower().str.strip() + ", " + \
|
||||
asset_list["add_3"].str.lower().str.strip() + ", " + \
|
||||
asset_list["add_4"].str.lower().str.strip() + ", " + \
|
||||
asset_list["add_5"].str.lower().str.strip() + ", " + \
|
||||
asset_list["post_code"].str.lower().str.strip()
|
||||
asset_list["matching_postcode"] = asset_list["post_code"].str.lower().str.strip()
|
||||
elif ha_name == "ha_107":
|
||||
# Create matching_address by concatenating House No, Street, Town, District, Postcode
|
||||
asset_list["matching_address"] = asset_list["House No"].astype(str).str.lower().str.strip() + ", " + \
|
||||
asset_list["Street"].str.lower().str.strip() + ", " + \
|
||||
asset_list["Town"].str.lower().str.strip() + ", " + \
|
||||
asset_list["District"].str.lower().str.strip() + ", " + \
|
||||
asset_list["Postcode"].str.lower().str.strip()
|
||||
asset_list["matching_postcode"] = asset_list["Postcode"].str.lower().str.strip()
|
||||
else:
|
||||
raise NotImplementedError("implement me")
|
||||
|
||||
if ha_name in ["ha_107"]:
|
||||
asset_list["HouseNo"] = asset_list["House No"].copy()
|
||||
else:
|
||||
split_addresses = asset_list['matching_address'].str.split(',', expand=True)
|
||||
house_numbers = split_addresses[0].str.split(' ', expand=True)
|
||||
# The first column should be HouseNo - we aren't interested in the other columns, but we don't know how
|
||||
# many columns there might be
|
||||
house_numbers = house_numbers.iloc[:, 0:1]
|
||||
house_numbers.columns = ['HouseNo']
|
||||
|
||||
asset_list = pd.concat([asset_list, house_numbers[["HouseNo"]]], axis=1)
|
||||
|
||||
return asset_list
|
||||
|
||||
def load_survey_list(self, file_path, ha_name, asset_list, sheet_name=None):
|
||||
|
|
@ -165,22 +227,10 @@ class DataLoader:
|
|||
def merge_ha_6(asset_list, survey_list):
|
||||
|
||||
# Correct the asset list
|
||||
asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Baggott Place", "Baggotts Place")
|
||||
asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Cherry Tree", "Cherrytree")
|
||||
asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Maryhill Close", "Mary Hill Close")
|
||||
asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Moffat Way", "Moffatt Way")
|
||||
|
||||
# Prepare the asset list
|
||||
asset_list["matching_address"] = asset_list["propertyaddress"].str.lower().str.strip()
|
||||
asset_list["matching_postcode"] = asset_list["Post Code"].str.lower().str.strip()
|
||||
|
||||
split_addresses = asset_list['matching_address'].str.split(',', expand=True)
|
||||
split_addresses.columns = ['temp', 'address2', 'address3', 'address4', 'address5']
|
||||
house_numbers = split_addresses['temp'].str.split(' ', expand=True)
|
||||
house_numbers.columns = ['HouseNo', 'part1', 'part2', "part3", "part4", "part5"]
|
||||
|
||||
asset_list = pd.concat([asset_list, house_numbers[["HouseNo"]]], axis=1)
|
||||
del split_addresses, house_numbers
|
||||
asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("baggott place", "baggotts place")
|
||||
asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("cherry tree", "cherrytree")
|
||||
asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("maryhill close", "mary hill close")
|
||||
asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("moffat way", "moffatt way")
|
||||
|
||||
# Correct the survey list
|
||||
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
|
||||
|
|
@ -403,6 +453,30 @@ class DataLoader:
|
|||
)
|
||||
|
||||
|
||||
def get_epc_data(loader):
    """
    Run an EPC search for every property in each loaded housing association's asset list.

    :param loader: object whose ``data`` attribute maps each HA name to a dict holding an "asset_list" entry;
        the asset list is iterated with ``iterrows()`` and each row is read for the "No.", "Postcode",
        "Address" and "Type" keys
    :raises ValueError: if ``loader.data`` is missing or empty, i.e. ``loader.load()`` has not been run yet
    :raises KeyError: if a property's "Type" has no entry in the property type lookup
    """
    # Use getattr so that a loader which never had ``data`` assigned produces the intended ValueError
    # rather than an AttributeError
    if not getattr(loader, "data", None):
        raise ValueError("Data not found - please run loader.load() first")

    # NOTE(review): this lookup is empty, so every row will currently raise a KeyError below - it needs to be
    # populated with a "property-type" and "built-form" entry per asset list "Type" value
    property_type_lookup = {}

    for data_assets in loader.data.values():
        # For each HA, we pull in the data required
        asset_list = data_assets["asset_list"]

        # We iterate through the asset list and pull what we need
        for _, property_meta in tqdm(asset_list.iterrows(), total=len(asset_list)):
            searcher = SearchEpc(
                address1=property_meta["No."],
                postcode=property_meta["Postcode"],
                auth_token=EPC_AUTH_TOKEN,
                os_api_key=None,
                full_address=property_meta["Address"]
            )

            # Look the type up once and re-use it for both attributes
            type_details = property_type_lookup[property_meta["Type"]]
            searcher.ordnance_survey_client.property_type = type_details["property-type"]
            searcher.ordnance_survey_client.built_form = type_details["built-form"]

            # The return value of find_property is not captured here
            searcher.find_property(skip_os=True)
|
||||
|
||||
|
||||
def app():
|
||||
"""
|
||||
This app contains the housing association analysis for HAs 1, 6, 14, 39 and 107.
|
||||
|
|
@ -451,3 +525,23 @@ def app():
|
|||
|
||||
loader = DataLoader(files, use_cache)
|
||||
loader.load()
|
||||
|
||||
# TODO: We probably need to make sure that we have all of the columns that we need
|
||||
|
||||
# We load in the additional data required to perform the analysis
|
||||
|
||||
cleaned = read_from_s3(
|
||||
s3_file_name="cleaned_epc_data/cleaned.bson",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
cleaned = msgpack.unpackb(cleaned, raw=False)
|
||||
|
||||
cleaning_data = read_dataframe_from_s3_parquet(
|
||||
bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
|
||||
)
|
||||
|
||||
created_at = datetime.now().isoformat()
|
||||
|
||||
photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket="retrofit-data-dev")
|
||||
|
||||
get_epc_data(loader)
|
||||
|
|
|
|||
38
etl/testing_data/livewest_pilot.py
Normal file
38
etl/testing_data/livewest_pilot.py
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
"""
|
||||
This script will create an input csv for the recommendation engine and upload it to S3, which can be used for
|
||||
testing
|
||||
"""
|
||||
import os
|
||||
|
||||
import pandas as pd
|
||||
from utils.s3 import save_csv_to_s3
|
||||
|
||||
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", None)
|
||||
USER_ID = 8
|
||||
PORTFOLIO_ID = 61
|
||||
|
||||
|
||||
def app():
    """
    Create the pilot input csv, upload it to S3 and print the body that points at the uploaded file.
    """
    # (address, postcode) for each pilot property - Notes is left empty for all of them
    pilot_properties = [
        ("42, Foxes Field", "TR18 3RJ"),
        ("11, Cranley Gardens", "TQ13 8UT"),
    ]
    pilot_file = pd.DataFrame(
        [
            {"address": address, "postcode": postcode, "Notes": None}
            for address, postcode in pilot_properties
        ]
    )

    # Store the data in s3, keyed by user and portfolio
    filename = f"{USER_ID}/{PORTFOLIO_ID}/livewest_pilot_file.csv"
    save_csv_to_s3(dataframe=pilot_file, bucket_name="retrofit-plan-inputs-dev", file_name=filename)

    # The body references the file that has just been uploaded
    body = dict(
        portfolio_id=str(PORTFOLIO_ID),
        housing_type="Social",
        goal="Increase EPC",
        goal_value="C",
        trigger_file_path=filename,
    )
    print(body)
|
||||
38
etl/testing_data/the_guiness_partnership_pilot.py
Normal file
38
etl/testing_data/the_guiness_partnership_pilot.py
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
"""
|
||||
This script will create an input csv for the recommendation engine and upload it to S3, which can be used for
|
||||
testing
|
||||
"""
|
||||
import os
|
||||
|
||||
import pandas as pd
|
||||
from utils.s3 import save_csv_to_s3
|
||||
|
||||
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", None)
|
||||
USER_ID = 8
|
||||
PORTFOLIO_ID = 59
|
||||
|
||||
|
||||
def app():
    """
    Create the pilot input csv, upload it to S3 and print the body that points at the uploaded file.
    """
    # (address, postcode) for each pilot property - Notes is left empty for all of them
    pilot_properties = [
        ("10 Elm Close", "CV37 8XL"),
        ("21, Spring Lane", "MK17 0QP"),
    ]
    pilot_file = pd.DataFrame(
        [
            {"address": address, "postcode": postcode, "Notes": None}
            for address, postcode in pilot_properties
        ]
    )

    # Store the data in s3, keyed by user and portfolio
    filename = f"{USER_ID}/{PORTFOLIO_ID}/the_guiness_partnership_pilot_file.csv"
    save_csv_to_s3(dataframe=pilot_file, bucket_name="retrofit-plan-inputs-dev", file_name=filename)

    # The body references the file that has just been uploaded
    body = dict(
        portfolio_id=str(PORTFOLIO_ID),
        housing_type="Social",
        goal="Increase EPC",
        goal_value="C",
        trigger_file_path=filename,
    )
    print(body)
|
||||
Loading…
Add table
Reference in a new issue