mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
sfr investigation in progress
This commit is contained in:
parent
e58c165a63
commit
39bc6c53b8
1 changed files with 248 additions and 6 deletions
|
|
@ -75,10 +75,15 @@ def find_f_g_properties(paths):
|
|||
epc_data = epc_data[~pd.isnull(epc_data["UPRN"])]
|
||||
epc_data["UPRN"] = epc_data["UPRN"].astype(int).astype(str)
|
||||
|
||||
# Get the newest EPC for each UPRN. We use LODGEMENT_DATE as a proxy for this
|
||||
epc_data["LODGEMENT_DATETIME"] = pd.to_datetime(epc_data["LODGEMENT_DATETIME"], format='mixed', errors="coerce")
|
||||
if pd.isnull(pd.to_datetime(epc_data["LODGEMENT_DATETIME"], errors="coerce")).sum():
|
||||
raise Exception("wtf")
|
||||
|
||||
epc_data = epc_data.sort_values("LODGEMENT_DATETIME", ascending=False).drop_duplicates("UPRN")
|
||||
# Get the newest EPC for each UPRN. We use LODGEMENT_DATE as a proxy for this
|
||||
epc_data["LODGEMENT_DATETIME"] = pd.to_datetime(epc_data["LODGEMENT_DATETIME"], errors="coerce")
|
||||
|
||||
epc_data = epc_data.sort_values(
|
||||
["LODGEMENT_DATE", "LODGEMENT_DATETIME"], ascending=False
|
||||
).drop_duplicates("UPRN")
|
||||
|
||||
# Get G & F properties
|
||||
epc_data = epc_data[epc_data["CURRENT_ENERGY_RATING"].isin(["G", "F"])]
|
||||
|
|
@ -401,6 +406,8 @@ def app():
|
|||
~company_ownership["Property Address"].str.lower().str.startswith(starting_term)
|
||||
]
|
||||
|
||||
# address = properties[properties["UPRN"] == 100030253055].squeeze()
|
||||
|
||||
freehold_matching_lookup = [] # 634
|
||||
leasehold_matching_lookup = [] # 86
|
||||
shared_leasehold_match = []
|
||||
|
|
@ -493,12 +500,18 @@ def app():
|
|||
# freehold_matching_lookup = pd.read_excel("freehold_matching_lookup V2.xlsx")
|
||||
# leasehold_matching_lookup = pd.read_excel("leasehold_matching_lookup V2.xlsx")
|
||||
|
||||
# freehold_matching_lookup.shape
|
||||
# (1537, 4)
|
||||
# leasehold_matching_lookup.shape
|
||||
# (390, 4)
|
||||
|
||||
# The approximate matches aren't very good
|
||||
freehold_matching_lookup = freehold_matching_lookup[freehold_matching_lookup["match_type"] == "exact"]
|
||||
leasehold_matching_lookup = leasehold_matching_lookup[leasehold_matching_lookup["match_type"] == "exact"]
|
||||
|
||||
# Combine
|
||||
combined_matching_lookup = pd.concat([freehold_matching_lookup, leasehold_matching_lookup])
|
||||
|
||||
# Remove duplicates
|
||||
combined_matching_lookup = remove_duplicate_matches(
|
||||
matching_lookup=combined_matching_lookup, properties=properties, company_ownership=company_ownership
|
||||
|
|
@ -566,7 +579,6 @@ def app():
|
|||
|
||||
land_registry_matches = []
|
||||
for _, match in tqdm(matched_addresses.iterrows(), total=len(matched_addresses)):
|
||||
|
||||
# Filter land registry on the postcode
|
||||
lr_filtered = land_registry[
|
||||
(land_registry["postcode"] == match["epc_postcode"].lower().strip())
|
||||
|
|
@ -782,7 +794,7 @@ def app():
|
|||
right_on="uprn"
|
||||
).drop(columns=["uprn"])
|
||||
|
||||
# Flat anything that sold in the last year
|
||||
# Flag anything that sold in the last year
|
||||
matched_addresses["sold_recently"] = (
|
||||
matched_addresses["date_of_transfer"] >= pd.Timestamp.now() - pd.DateOffset(years=1)
|
||||
)
|
||||
|
|
@ -792,6 +804,9 @@ def app():
|
|||
(matched_addresses["TRANSACTION_TYPE"].isin(["marketed sale", "non marketed sale"]))
|
||||
)
|
||||
|
||||
# Save this
|
||||
# matched_addresses.to_excel("combined_aggregate - pre filter 28th July.xlsx", index=False)
|
||||
|
||||
# Drop rows on the booleans
|
||||
matched_addresses = matched_addresses[
|
||||
~matched_addresses["sold_recently"] &
|
||||
|
|
@ -835,7 +850,7 @@ def app():
|
|||
# investment_50m_properties.to_excel("investment_50m_properties 28th July.xlsx", index=False)
|
||||
|
||||
# Store the EPC data
|
||||
# portfolio_epc_data_50m.to_excel("portfolio_epc_data_50m 29th July.xlsx", index=False)
|
||||
# portfolio_epc_data_50m.to_excel("portfolio_epc_data_50m 28th July.xlsx", index=False)
|
||||
|
||||
# We check if any of these properties are in a conservation area
|
||||
valuations = pd.read_excel("property value.xlsx")
|
||||
|
|
@ -997,3 +1012,230 @@ def prepare_anonymised_data():
|
|||
)
|
||||
|
||||
df.to_excel("Property List - 50% redacted.xlsx", index=False)
|
||||
|
||||
|
||||
def adhoc_change_of_portfolio_analysis_july_2024():
    """
    Ad hoc analysis answering questions which arose upon refreshing the SFR portfolio in late July 2024.

    Every input is an Excel workbook read by relative path from the current working directory. Findings are
    printed; nothing is returned.

    NOTE(review): only ``answer_q1`` is invoked here. ``answer_q2`` is defined but never called from this
    function - presumably it is stepped through interactively; confirm before relying on it being run.

    :return: None
    """

    # Question 1: Which properties in the previous portfolio were in conservation areas or had listed/heritage status?
    def answer_q1():
        """
        Print how many rows of "geospatial_data.xlsx" are flagged as conservation area, listed or heritage.

        :return: None
        """
        # Data was just stored here:
        geospatial_data = pd.read_excel("geospatial_data.xlsx")

        # NOTE(review): assumes is_listed_building / is_heritage_building are boolean columns - TODO confirm
        special_buildings = geospatial_data[
            (geospatial_data["conservation_status"] == 1) |
            geospatial_data["is_listed_building"] |
            geospatial_data["is_heritage_building"]
        ]

        print(
            f"There were {special_buildings.shape[0]} properties in the previous portfolio which were in conservation "
            f"areas or had listed/heritage status"
        )
        print(f"{(special_buildings['conservation_status'] == 1).sum()} were in a conservation area")
        print(f"{special_buildings['is_listed_building'].sum()} were listed buildings")
        print(f"{special_buildings['is_heritage_building'].sum()} were heritage buildings")

    answer_q1()

    # Question 2: For each property in the old portfolio, why was it lost?
    def answer_q2():
        """
        Bucket every unit that was in the previous 50m portfolio but not in the new one by the reason it was lost.

        Buckets: sold recently, sale EPC lodged but not yet sold, owner too small, owner big enough (but absent
        from the new portfolio), no longer in the F/G EPC extract, previously matched only via the matching
        lookups, and the remainder (ownership changes that were checked by hand). The assertions verify that the
        buckets add up to the number of dropped units.

        :return: None
        :raises Exception: if a dropped, unsold unit belongs to an owner that is still in the new portfolio
        :raises AssertionError: if the buckets do not reconcile with the number of dropped units
        """
        owner_id_col = "Company Registration No. (1)"
        owner_name_col = "Proprietor Name (1)"
        size_col = "Number of Properties"

        def owner_sizes(portfolio):
            # One row per owner with the number of properties held, largest owners first
            sizes = portfolio.groupby([owner_id_col]).size().reset_index(name=size_col)
            return sizes.sort_values(size_col, ascending=False)

        # We read in the previous 50m portfolio
        previous_portfolio = pd.read_excel("investment_50m_properties 28th May.xlsx")  # 39 owners
        n_previous = previous_portfolio.shape[0]

        new_matched_addresses = pd.read_excel("combined_aggregate - pre filter 28th July.xlsx")
        new_portfolio = pd.read_excel("investment_50m_properties 28th July.xlsx")  # 69 owners

        # dropped units
        dropped_units = previous_portfolio[
            ~previous_portfolio["UPRN"].isin(new_portfolio["UPRN"].values)
        ]
        # Lots of properties are missed out - why
        # 1) What was dropped, but was in the matched addresses and therefore was maybe filtered out
        dropped_units_matched = dropped_units[
            dropped_units["UPRN"].isin(new_matched_addresses["UPRN"])
        ].copy()

        dropped_units_matched = dropped_units_matched.merge(
            new_matched_addresses[
                ["UPRN", 'transaction_id', 'price', 'date_of_transfer', 'sold_recently', 'sale_lodged_recently']
            ],
            how="left", on="UPRN"
        )

        # 97 units here - how many were sold
        of_which_sold = dropped_units_matched[
            dropped_units_matched["sold_recently"]
        ]
        n_sold = of_which_sold.shape[0]
        print(f"{n_sold} sold recently ({n_sold / n_previous * 100:.1f}%)")

        of_which_have_sale_epc_but_not_sold = dropped_units_matched[
            ~dropped_units_matched["sold_recently"] & dropped_units_matched["sale_lodged_recently"]
        ]
        n_with_sale_epc_but_not_yet_sold = of_which_have_sale_epc_but_not_sold.shape[0]
        print(
            f"{n_with_sale_epc_but_not_yet_sold} have a sale EPC but have not sold yet ("
            f"{n_with_sale_epc_but_not_yet_sold / n_previous * 100:.1f}%)"
        )

        # What about things that haven't sold or don't look likely to sell
        not_sold = dropped_units_matched[
            ~dropped_units_matched["sold_recently"] & ~dropped_units_matched["sale_lodged_recently"]
        ]

        new_owner_sizes = owner_sizes(new_portfolio)
        previous_owner_sizes = owner_sizes(previous_portfolio)

        # Let's just confirm that we took in a bigger owner, as we see this unit was still matched
        # The smallest owner in the new portfolio is the bar an old owner has to clear; it does not change
        # between iterations so it is computed once
        smallest_new_owner_size = new_owner_sizes[size_col].min()
        owner_too_small = []
        owner_big_enough = []
        for _, dropped_unit in not_sold.iterrows():
            owner_reg_id = dropped_unit[owner_id_col]
            old_portfolio_owner_size = previous_owner_sizes[
                previous_owner_sizes[owner_id_col] == owner_reg_id
            ]
            # The owner is "too small" when their old holding is no bigger than the new smallest holding
            if old_portfolio_owner_size[size_col].values[0] > smallest_new_owner_size:
                owner_big_enough.append(dropped_unit.to_dict())
            else:
                owner_too_small.append(dropped_unit.to_dict())

        n_owner_too_small = len(owner_too_small)
        # Passing the columns keeps the frame usable (rather than column-less) when no unit fell in this bucket
        owner_big_enough = pd.DataFrame(owner_big_enough, columns=not_sold.columns)

        summary = []
        for _, record in owner_big_enough.iterrows():
            # Do we have this new owner?
            new_owner = new_portfolio[
                new_portfolio[owner_id_col] == record[owner_id_col]
            ]
            if not new_owner.empty:
                # The owner made it into the new portfolio, so none of the reasons above explains this unit
                raise Exception(
                    f"something went wrong: owner {record[owner_id_col]} is in the new portfolio but "
                    f"UPRN {record['UPRN']} was dropped without having been sold"
                )

            # Why don't we have this new owner
            new_owner_data = new_matched_addresses[
                new_matched_addresses[owner_id_col] == record[owner_id_col]
            ]

            new_owner_data_filtered = new_owner_data[
                ~new_owner_data["sold_recently"] & ~new_owner_data["sale_lodged_recently"]
            ]

            summary.append(
                {
                    "Owner Name": record[owner_name_col],
                    "Owner reg id": record[owner_id_col],
                    "N properties in new portfolio before filtering": new_owner_data.shape[0],
                    "N properties in new portfolio after filtering": new_owner_data_filtered.shape[0],
                }
            )

        summary = pd.DataFrame(
            summary,
            columns=[
                "Owner Name",
                "Owner reg id",
                "N properties in new portfolio before filtering",
                "N properties in new portfolio after filtering",
            ],
        )

        not_accounted_for = summary[
            (
                summary["N properties in new portfolio before filtering"] <
                previous_owner_sizes[size_col].min()
            )
        ]

        # We have two owners not accounted for:
        # ALLMID LIMITED, 01959058
        # CORAL RACING LIMITED, 541600
        # What happened to these owners?
        # The new EPC extract is read once here and reused for the unmatched units further down
        new_epc = pd.read_excel("EPC F & G Properties - V2.xlsx")
        allmid = previous_portfolio[previous_portfolio[owner_id_col] == "01959058"].copy()
        # Check if any of the properties are not in the new EPC data
        allmid["not_in_new_epc"] = ~allmid["UPRN"].isin(new_epc["UPRN"])
        allmid["not_in_matched_pre_filtered"] = ~allmid["UPRN"].isin(new_matched_addresses["UPRN"])
        # In the previous portfolio, Allmid had 4 properties and in the re-build, it has just 2. Why?
        # Firstly, one of their properties was re-surveyed not at an F/G
        # Secondly, one of their properties is no longer owned by them:
        # https://www.zoopla.co.uk/property/uprn/100070553074/
        # So as an owner, they fell out of the ranking
        coral_racing = previous_portfolio[previous_portfolio[owner_id_col] == "541600"].copy()
        coral_racing["not_in_new_epc"] = ~coral_racing["UPRN"].isin(new_epc["UPRN"])
        coral_racing["not_in_matched_pre_filtered"] = ~coral_racing["UPRN"].isin(new_matched_addresses["UPRN"])
        # Coral goes down from 4 -> 1 on refresh, so what happened?
        # 1) 2 properties had new EPCs and re-scored higher
        # 2) 1 property, 85A Market Street, Church Gresley, Swadlincote, DE11 9PN is no longer matched to the ownership
        # data, which is correct

        # Why were these units lost?
        # There's just 1 owner, who is BARHAM PROPERTY LTD
        owner_too_big_ids = owner_big_enough[owner_id_col].unique()
        owner_too_big_names = owner_big_enough[owner_name_col].unique()
        previous_owner_size = previous_owner_sizes[
            previous_owner_sizes[owner_id_col].isin(owner_too_big_ids)
        ]
        new_owner_size = new_matched_addresses[
            new_matched_addresses[owner_id_col].isin(owner_too_big_ids) |
            new_matched_addresses[owner_name_col].isin(owner_too_big_names)
        ]

        # A row count, not the (rows, columns) shape tuple
        n_unsold = new_owner_size[~new_owner_size["sold_recently"] & ~new_owner_size["sale_lodged_recently"]].shape[0]

        # Happy with the justification to this point
        n_matched_accounted_for = (
            n_sold + n_with_sale_epc_but_not_yet_sold + n_owner_too_small + len(owner_big_enough)
        )
        assert n_matched_accounted_for == dropped_units_matched.shape[0]

        # We now have a list of properties that were lost from the previous iteration to the next that were not matched
        dropped_units_unmatched = dropped_units[
            ~dropped_units["UPRN"].isin(new_matched_addresses["UPRN"])
        ].copy()

        # A few possibilities: They aren't in the EPC data?
        in_new_epc = dropped_units_unmatched["UPRN"].isin(new_epc["UPRN"])
        unmatched_not_in_epc = dropped_units_unmatched[~in_new_epc]
        # There are 17 units that have had new EPCs above a G
        # Who were the owners? - various, nothing particularly remarkable
        owners_of_rescored_units = (
            previous_portfolio[
                previous_portfolio["UPRN"].isin(unmatched_not_in_epc["UPRN"])
            ][owner_name_col].value_counts()
        )

        # 22 final units to be accounted for...!
        unmatched_in_epc = dropped_units_unmatched[in_new_epc]

        # Some of them will be due to ownership
        # TODO: Read in freehold/leasehold data and see how many of these were non-exact matches!
        leasehold_matching_lookup = pd.read_excel("leasehold_matching_lookup V2.xlsx")
        freehold_matching_lookup = pd.read_excel("freehold_matching_lookup V2.xlsx")
        combined_matching_lookup = pd.concat([leasehold_matching_lookup, freehold_matching_lookup])
        # This is 13 matches, all of them approximate
        weak_matches = unmatched_in_epc.merge(combined_matching_lookup, how="inner", on="UPRN")

        # Count units rather than merge rows: a UPRN present in both lookups would otherwise be counted twice
        is_weak_match = unmatched_in_epc["UPRN"].isin(weak_matches["UPRN"])
        n_weak_matches = int(is_weak_match.sum())

        # These have been lost due to ownership updates. This has been checked manually for every unit and there has
        # been sale activity for each one, justifying the change in ownership data
        remaining_matches = unmatched_in_epc[~is_weak_match]

        # Every dropped unit must land in exactly one bucket. The manually verified remainder is a bucket too;
        # leaving it out means the totals can only balance when it happens to be empty
        assert dropped_units.shape[0] == (
            n_matched_accounted_for
            + n_weak_matches
            + remaining_matches.shape[0]
            + unmatched_not_in_epc.shape[0]
        )
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue