Model/sfr/principal_pitch/0_prepare_sample.py
2025-06-25 20:43:40 +01:00

71 lines
2.6 KiB
Python

"""
This script prepares a sample dataset for testing the end-to-end process, so that when Spring sends us
their data, we know it will work.
"""
import pandas as pd
from utils.s3 import read_csv_from_s3
# Load the Birmingham domestic EPC certificates export from the local CSV file.
birmingham_epcs = pd.read_csv(
    "/Users/khalimconn-kowlessar/Documents/hestia/sfr/Spring JV/domestic-E08000025-Birmingham/certificates.csv"
)
# Parse the lodgement date so it sorts and compares chronologically rather than as text.
birmingham_epcs['LODGEMENT_DATE'] = pd.to_datetime(birmingham_epcs['LODGEMENT_DATE'])
# We get the newest EPC, by UPRN and LODGEMENT_DATE.
# Certificates without a UPRN are dropped first: drop_duplicates treats missing values as equal,
# so it would otherwise collapse every UPRN-less certificate into a single arbitrary row, and
# that row cannot be matched to a property (or cast to an integer UPRN) later on.
birmingham_epcs = (
    birmingham_epcs
    .dropna(subset=['UPRN'])
    .sort_values(by=['UPRN', 'LODGEMENT_DATE'], ascending=[True, False])
    .drop_duplicates(subset='UPRN')
)
# The postal region is the part of the postcode before the first space.
birmingham_epcs["postal_region"] = birmingham_epcs["POSTCODE"].str.split(" ").str[0]
# Addressable market: EPCs rated D to G, lodged on or after 2020-01-01, for privately rented
# houses and bungalows. Both spellings of the private-rental tenure value are accepted.
addressable_market = birmingham_epcs[
    (birmingham_epcs['CURRENT_ENERGY_RATING'].isin(['F', 'G', 'E', 'D'])) &
    (birmingham_epcs['LODGEMENT_DATE'] >= '2020-01-01') &
    (birmingham_epcs['PROPERTY_TYPE'].isin(['House', 'Bungalow'])) &
    (birmingham_epcs['TENURE'].isin(
        ['rental (private)', 'Rented (private)']
    ))
]
# We take the Spring portfolio and remove the properties that are already in it
asset_list = read_csv_from_s3(bucket_name="retrofit-plan-inputs-dev", filepath='8/206/asset_list.csv')
asset_list = pd.DataFrame(asset_list)
asset_list["postal_region"] = asset_list["postcode"].str.split(" ").str[0]
# Compare UPRNs as strings. pandas reads UPRN as float64 when any value is missing, and a plain
# astype(int) raises on NaN, so we go through the nullable Int64 dtype instead: real UPRNs become
# the same digit strings as before, missing ones become "<NA>" and simply never match.
# NOTE(review): assumes asset_list["uprn"] holds strings - confirm against read_csv_from_s3.
market_uprns = pd.to_numeric(addressable_market["UPRN"]).astype("Int64").astype(str)
addressable_market = addressable_market[
    ~market_uprns.isin(asset_list["uprn"].values)
]
# Keep only properties in postal regions where the Spring portfolio already has assets.
addressable_market = addressable_market[
    addressable_market["postal_region"].isin(asset_list["postal_region"].unique())
]
# Take a sample of properties: EPC F or G, lodged from 2025 onwards. We focus on houses/bungalows
# Build one boolean mask per criterion, then keep the rows that satisfy all three.
# NOTE(review): this filters the full de-duplicated EPC table, not `addressable_market` -
# confirm that is intended.
rated_f_or_g = birmingham_epcs['CURRENT_ENERGY_RATING'].isin(['F', 'G'])
lodged_since_2025 = birmingham_epcs['LODGEMENT_DATE'] >= '2025-01-01'
house_or_bungalow = birmingham_epcs['PROPERTY_TYPE'].isin(['House', 'Bungalow'])
sample = birmingham_epcs[rated_f_or_g & lodged_since_2025 & house_or_bungalow]
# Prepare the sample, with just the columns we would expect to receive from Spring
# 1) UPRN
# 2) Address
# 3) Postcode
# 4) Property type
# 5) Built form
# 6) Number of bedrooms (we'll simulate this)
# 7) Number of bathrooms (we'll simulate this)
# 8) Valuation (We'll simulate this, around 200,000)
# Keep only the EPC columns that mirror what we expect to receive, then add constant
# placeholder values for the fields the EPC data does not provide.
epc_columns = ['UPRN', 'ADDRESS', 'POSTCODE', 'PROPERTY_TYPE', 'BUILT_FORM']
sample = sample.loc[:, epc_columns].copy()
sample = sample.assign(
    BEDROOMS=3,  # Simulating number of bedrooms
    BATHROOMS=1,  # Simulating number of bathrooms
    VALUATION=200000,  # Simulating valuation
)
# Lower-case every column name.
sample = sample.rename(columns=str.lower)
# Store this as an Excel workbook, without the DataFrame index
sample.to_excel(
    "/Users/khalimconn-kowlessar/Documents/hestia/sfr/Spring JV/birmingham_sample.xlsx",
    index=False,
)