SFR investigation in progress

This commit is contained in:
Khalim Conn-Kowlessar 2024-08-01 12:21:10 +01:00
parent e58c165a63
commit 39bc6c53b8

View file

@ -75,10 +75,15 @@ def find_f_g_properties(paths):
epc_data = epc_data[~pd.isnull(epc_data["UPRN"])]
epc_data["UPRN"] = epc_data["UPRN"].astype(int).astype(str)
# Get the newest EPC for each UPRN. We use LODGEMENT_DATE as a proxy for this
epc_data["LODGEMENT_DATETIME"] = pd.to_datetime(epc_data["LODGEMENT_DATETIME"], format='mixed', errors="coerce")
if pd.isnull(pd.to_datetime(epc_data["LODGEMENT_DATETIME"], errors="coerce")).sum():
raise Exception("wtf")
epc_data = epc_data.sort_values("LODGEMENT_DATETIME", ascending=False).drop_duplicates("UPRN")
# Get the newest EPC for each UPRN. We use LODGEMENT_DATE as a proxy for this
epc_data["LODGEMENT_DATETIME"] = pd.to_datetime(epc_data["LODGEMENT_DATETIME"], errors="coerce")
epc_data = epc_data.sort_values(
["LODGEMENT_DATE", "LODGEMENT_DATETIME"], ascending=False
).drop_duplicates("UPRN")
# Get G & F properties
epc_data = epc_data[epc_data["CURRENT_ENERGY_RATING"].isin(["G", "F"])]
@ -401,6 +406,8 @@ def app():
~company_ownership["Property Address"].str.lower().str.startswith(starting_term)
]
# address = properties[properties["UPRN"] == 100030253055].squeeze()
freehold_matching_lookup = [] # 634
leasehold_matching_lookup = [] # 86
shared_leasehold_match = []
@ -493,12 +500,18 @@ def app():
# freehold_matching_lookup = pd.read_excel("freehold_matching_lookup V2.xlsx")
# leasehold_matching_lookup = pd.read_excel("leasehold_matching_lookup V2.xlsx")
# freehold_matching_lookup.shape
# (1537, 4)
# leasehold_matching_lookup.shape
# (390, 4)
# The approximate matches aren't very good
freehold_matching_lookup = freehold_matching_lookup[freehold_matching_lookup["match_type"] == "exact"]
leasehold_matching_lookup = leasehold_matching_lookup[leasehold_matching_lookup["match_type"] == "exact"]
# Combine
combined_matching_lookup = pd.concat([freehold_matching_lookup, leasehold_matching_lookup])
# Remove duplicates
combined_matching_lookup = remove_duplicate_matches(
matching_lookup=combined_matching_lookup, properties=properties, company_ownership=company_ownership
@ -566,7 +579,6 @@ def app():
land_registry_matches = []
for _, match in tqdm(matched_addresses.iterrows(), total=len(matched_addresses)):
# Filter land registry on the postcode
lr_filtered = land_registry[
(land_registry["postcode"] == match["epc_postcode"].lower().strip())
@ -782,7 +794,7 @@ def app():
right_on="uprn"
).drop(columns=["uprn"])
# Flat anything that sold in the last year
# Flag anything that sold in the last year
matched_addresses["sold_recently"] = (
matched_addresses["date_of_transfer"] >= pd.Timestamp.now() - pd.DateOffset(years=1)
)
@ -792,6 +804,9 @@ def app():
(matched_addresses["TRANSACTION_TYPE"].isin(["marketed sale", "non marketed sale"]))
)
# Save this
# matched_addresses.to_excel("combined_aggregate - pre filter 28th July.xlsx", index=False)
# Drop rows on the booleans
matched_addresses = matched_addresses[
~matched_addresses["sold_recently"] &
@ -835,7 +850,7 @@ def app():
# investment_50m_properties.to_excel("investment_50m_properties 28th July.xlsx", index=False)
# Store the EPC data
# portfolio_epc_data_50m.to_excel("portfolio_epc_data_50m 29th July.xlsx", index=False)
# portfolio_epc_data_50m.to_excel("portfolio_epc_data_50m 28th July.xlsx", index=False)
# We check if any of these properties are in a conservation area
valuations = pd.read_excel("property value.xlsx")
@ -997,3 +1012,230 @@ def prepare_anonymised_data():
)
df.to_excel("Property List - 50% redacted.xlsx", index=False)
def adhoc_change_of_portfolio_analysis_july_2024():
"""
This is just some adhoc analysis, which answers some questions which arose upon refreshing the SFR portfolio
in late July 2024
:return:
"""
# Question 1: Which properties in the previous portfolio were in conservation areas or had listed/heritage status?
def answer_q1():
    """
    Print how many rows of "geospatial_data.xlsx" are flagged as being in a conservation area, a listed building
    or a heritage building, followed by the count for each individual flag.
    :return:
    """
    # Data was just stored here:
    geospatial_data = pd.read_excel("geospatial_data.xlsx")

    # A row is "special" when at least one of the three flags is set
    is_special = (
        geospatial_data["conservation_status"].eq(1)
        | geospatial_data["is_listed_building"]
        | geospatial_data["is_heritage_building"]
    )
    special_buildings = geospatial_data[is_special]

    # The per-flag counts are taken over the filtered rows only
    n_special = special_buildings.shape[0]
    n_conservation = special_buildings["conservation_status"].eq(1).sum()
    n_listed = special_buildings["is_listed_building"].sum()
    n_heritage = special_buildings["is_heritage_building"].sum()

    print(
        f"There were {n_special} properties in the previous portfolio which were in conservation "
        f"areas or had listed/heritage status"
    )
    print(f"{n_conservation} were in a conservation area")
    print(f"{n_listed} were listed buildings")
    print(f"{n_heritage} were heritage buildings")
answer_q1()
# Question 2: For each property in the old portfolio, why was it lost?
def answer_q2():
# Purpose: reconcile the previous 50m portfolio against the refreshed one. Every UPRN that is in the previous
# portfolio but not in the new one is placed into a bucket describing why it was lost, and the bucket sizes are
# asserted to add up to the number of dropped units. Takes no arguments and reads its inputs from Excel files
# in the working directory.
# We read in the previous 50m portfolio
previous_portfolio = pd.read_excel("investment_50m_properties 28th May.xlsx") # 39 owners
# Refreshed matched addresses, as saved before the sold/sale-lodged filter was applied
new_matched_addresses = pd.read_excel("combined_aggregate - pre filter 28th July.xlsx")
new_portfolio = pd.read_excel("investment_50m_properties 28th July.xlsx") # 69 owners
# Dropped units: rows of the previous portfolio whose UPRN does not appear in the new portfolio
dropped_units = previous_portfolio[
~previous_portfolio["UPRN"].isin(new_portfolio["UPRN"].values)
]
# Lots of properties are missed out - why?
# 1) What was dropped, but was in the matched addresses and therefore was maybe filtered out
dropped_units_matched = dropped_units[
dropped_units["UPRN"].isin(new_matched_addresses["UPRN"])
].copy()
# Attach the sale information from the refreshed matched addresses to each dropped unit.
# NOTE(review): this left merge adds rows if a UPRN occurs more than once in new_matched_addresses - confirm
# that UPRN is unique there, otherwise the counts below are inflated.
dropped_units_matched = dropped_units_matched.merge(
new_matched_addresses[
["UPRN", 'transaction_id', 'price', 'date_of_transfer', 'sold_recently', 'sale_lodged_recently']
],
how="left", on="UPRN"
)
# 97 units here - how many were sold?
of_which_sold = dropped_units_matched[
dropped_units_matched["sold_recently"]
]
n_sold = of_which_sold.shape[0]
# The percentage is taken against the size of the whole previous portfolio, not against the dropped units.
# NOTE(review): the value is printed unrounded and the "%" sign lands outside the closing parenthesis.
print(f"{n_sold} sold recently ({n_sold / previous_portfolio.shape[0] * 100})%")
# Units that have not sold recently but do have a recently lodged sale EPC
of_which_have_sale_epc_but_not_sold = dropped_units_matched[
~dropped_units_matched["sold_recently"] & dropped_units_matched["sale_lodged_recently"]
]
n_with_sale_epc_but_not_yet_sold = of_which_have_sale_epc_but_not_sold.shape[0]
print(
f"{n_with_sale_epc_but_not_yet_sold} have a sale EPC but have not sold yet ("
f"{n_with_sale_epc_but_not_yet_sold / previous_portfolio.shape[0] * 100})%"
)
# What about things that haven't sold or don't look likely to sell
not_sold = dropped_units_matched[
~dropped_units_matched["sold_recently"] & ~dropped_units_matched["sale_lodged_recently"]
]
# Number of properties per owner (keyed on company registration number) in the new portfolio, largest first
new_owner_sizes = new_portfolio.groupby(
["Company Registration No. (1)"]
).size().reset_index().rename(columns={0: "Number of Properties"})
new_owner_sizes = new_owner_sizes.sort_values("Number of Properties", ascending=False)
# The same per-owner counts for the previous portfolio
previous_owner_sizes = previous_portfolio.groupby(
["Company Registration No. (1)"]
).size().reset_index().rename(columns={0: "Number of Properties"})
previous_owner_sizes = previous_owner_sizes.sort_values("Number of Properties", ascending=False)
# Let's just confirm that we took in a bigger owner, as we see this unit was still matched
owner_too_small = []
owner_big_enough = []
for _, property in not_sold.iterrows():
owner_reg_id = property["Company Registration No. (1)"]
old_portfolio_owner_size = previous_owner_sizes[
previous_owner_sizes["Company Registration No. (1)"] == owner_reg_id
]
# An owner is "big enough" when their previous property count is strictly greater than the smallest owner
# count in the new portfolio; otherwise (including equal counts) the unit goes into owner_too_small.
# NOTE(review): .values[0] raises IndexError if no row of previous_owner_sizes matches this registration
# number (e.g. a missing registration number) - confirm that cannot happen.
if (
old_portfolio_owner_size["Number of Properties"].values[0] >
new_owner_sizes["Number of Properties"].min()
):
owner_big_enough.append(property.to_dict())
continue
owner_too_small.append(property.to_dict())
n_owner_too_small = len(owner_too_small)
# From here on owner_big_enough is a DataFrame with one row per unit, no longer a list of dicts
owner_big_enough = pd.DataFrame(owner_big_enough)
summary = []
for _, record in owner_big_enough.iterrows():
# Do we have this new owner?
new_owner = new_portfolio[
new_portfolio["Company Registration No. (1)"] == record["Company Registration No. (1)"]
]
if new_owner.empty:
# Why don't we have this new owner
new_owner_data = new_matched_addresses[
new_matched_addresses["Company Registration No. (1)"] == record["Company Registration No. (1)"]
]
# Apply the same sold / sale-lodged exclusion to see how many of the owner's units would survive it
new_owner_data_filtered = new_owner_data[
~new_owner_data["sold_recently"] & ~new_owner_data["sale_lodged_recently"]
]
summary.append(
{
"Owner Name": record["Proprietor Name (1)"],
"Owner reg id": record["Company Registration No. (1)"],
"N properties in new portfolio before filtering": new_owner_data.shape[0],
"N properties in new portfolio after filtering": new_owner_data_filtered.shape[0],
}
)
continue
# NOTE(review): presumably reached only when the owner IS present in the new portfolio even though this
# unit was dropped, which is treated as an unexpected case - confirm against the original indentation.
raise Exception("something went wrong")
summary = pd.DataFrame(summary)
# Owners whose pre-filter count in the refreshed data is below the smallest owner count of the previous portfolio
not_accounted_for = summary[
(
summary["N properties in new portfolio before filtering"] <
previous_owner_sizes["Number of Properties"].min()
)
]
# We have two owners not accounted for:
# ALLMID LIMITED, 01959058
# CORAL RACING LIMITED, 541600
# What happened to these owners?
new_epc = pd.read_excel("EPC F & G Properties - V2.xlsx")
allmid = previous_portfolio[previous_portfolio["Company Registration No. (1)"] == "01959058"].copy()
# Check if any of the properties are not in the new EPC data
allmid["not_in_new_epc"] = ~allmid["UPRN"].isin(new_epc["UPRN"])
allmid["not_in_matched_pre_filtered"] = ~allmid["UPRN"].isin(new_matched_addresses["UPRN"])
# In the previous portfolio, Allmid had 4 properties and in the re-build, it has just 2. Why?
# Firstly, one of their properties was re-surveyed not at an F/G
# Secondly, one of their properties is no longer owned by them:
# https://www.zoopla.co.uk/property/uprn/100070553074/
# So as an owner, they fell out of the ranking
coral_racing = previous_portfolio[previous_portfolio["Company Registration No. (1)"] == "541600"].copy()
coral_racing["not_in_new_epc"] = ~coral_racing["UPRN"].isin(new_epc["UPRN"])
coral_racing["not_in_matched_pre_filtered"] = ~coral_racing["UPRN"].isin(new_matched_addresses["UPRN"])
# Coral goes down from 4 -> 1 on refresh, so what happened?
# 1) 2 properties had new EPCs and re-scored higher
# 2) 1 property, 85A Market Street, Church Gresley, Swadlincote, DE11 9PN is no longer matched to the ownership
# data, which is correct
# Why were these units lost?
# There's just 1 owner, who is BARHAM PROPERTY LTD
# Despite the "too_big" names, these ids and names are taken from the owner_big_enough units
owner_too_big_ids = owner_big_enough["Company Registration No. (1)"].unique()
owner_too_big_names = owner_big_enough["Proprietor Name (1)"].unique()
previous_owner_size = previous_owner_sizes[
previous_owner_sizes["Company Registration No. (1)"].isin(owner_too_big_ids)
]
# Rows of the refreshed matched addresses belonging to those owners, matched on registration number OR name
new_owner_size = new_matched_addresses[
new_matched_addresses["Company Registration No. (1)"].isin(owner_too_big_ids) |
new_matched_addresses["Proprietor Name (1)"].isin(owner_too_big_names)
]
# NOTE(review): .shape is the (rows, columns) tuple rather than a row count, and n_unsold is not used again
# in the code visible here.
n_unsold = new_owner_size[~new_owner_size["sold_recently"] & ~new_owner_size["sale_lodged_recently"]].shape
# Happy with the justification to this point
# The four buckets must cover every dropped unit that still appears in the refreshed matched addresses
assert (
(n_sold + n_with_sale_epc_but_not_yet_sold + n_owner_too_small + len(owner_big_enough)) ==
dropped_units_matched.shape[0]
)
# We now have a list of properties that were lost from the previous iteration to the next that were not matched
dropped_units_unmatched = dropped_units[
~dropped_units["UPRN"].isin(new_matched_addresses["UPRN"])
].copy()
# A few possibilities: They aren't in the EPC data?
# This re-reads the same workbook that was already loaded into new_epc above
new_epc = pd.read_excel("EPC F & G Properties - V2.xlsx")
unmatched_not_in_epc = dropped_units_unmatched[
~dropped_units_unmatched["UPRN"].isin(new_epc["UPRN"])
]
# There are 17 units that have had new EPCs above a G
# Who were the owners? - various, nothing particularly remarkable
# The expression below is a bare statement: its result is discarded unless evaluated interactively
(
previous_portfolio[
previous_portfolio["UPRN"].isin(unmatched_not_in_epc["UPRN"])
]["Proprietor Name (1)"].value_counts()
)
# 22 final units to be accounted for...!
unmatched_in_epc = dropped_units_unmatched[
dropped_units_unmatched["UPRN"].isin(new_epc["UPRN"])
]
# Some of them will be due to ownership
# TODO: Read in freehold/leasehold data and see how many of these were non-exact matches!
leasehold_matching_lookup = pd.read_excel("leasehold_matching_lookup V2.xlsx")
freehold_matching_lookup = pd.read_excel("freehold_matching_lookup V2.xlsx")
combined_matching_lookup = pd.concat([leasehold_matching_lookup, freehold_matching_lookup])
# This is 13 matches, all of them approximate
# NOTE(review): this inner merge yields one row per matching lookup entry, so a UPRN listed more than once in
# the combined lookup is counted more than once in len(weak_matches).
weak_matches = unmatched_in_epc.merge(combined_matching_lookup, how="inner", on="UPRN")
# These have been lost due to ownership updates. This has been checked manually for every unit and there has
# been sale activity for each one, justifying the change in ownership data
# Units that are still in the EPC data but have no entry at all in the combined matching lookup
remaining_matches = unmatched_in_epc[
~unmatched_in_epc["UPRN"].isin(weak_matches["UPRN"])
]
# Final reconciliation of all dropped units against the buckets identified above.
# NOTE(review): remaining_matches is not part of this sum - confirm whether it should be.
assert dropped_units.shape[0] == (
(n_sold + n_with_sale_epc_but_not_yet_sold + n_owner_too_small + len(owner_big_enough)) + len(
weak_matches) + unmatched_not_in_epc.shape[0]
)