From 39bc6c53b867c66601042c313649f912adaae8d7 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 1 Aug 2024 12:21:10 +0100 Subject: [PATCH] sfr investigation in progress --- etl/customers/goldman/property_ownership.py | 254 +++++++++++++++++++- 1 file changed, 248 insertions(+), 6 deletions(-) diff --git a/etl/customers/goldman/property_ownership.py b/etl/customers/goldman/property_ownership.py index ebd72732..c1f37d4c 100644 --- a/etl/customers/goldman/property_ownership.py +++ b/etl/customers/goldman/property_ownership.py @@ -75,10 +75,15 @@ def find_f_g_properties(paths): epc_data = epc_data[~pd.isnull(epc_data["UPRN"])] epc_data["UPRN"] = epc_data["UPRN"].astype(int).astype(str) - # Get the newest EPC for each UPRN. We use LODGEMENT_DATE as a proxy for this - epc_data["LODGEMENT_DATETIME"] = pd.to_datetime(epc_data["LODGEMENT_DATETIME"], format='mixed', errors="coerce") + if pd.isnull(pd.to_datetime(epc_data["LODGEMENT_DATETIME"], errors="coerce")).sum(): + raise Exception("wtf") - epc_data = epc_data.sort_values("LODGEMENT_DATETIME", ascending=False).drop_duplicates("UPRN") + # Get the newest EPC for each UPRN.
We use LODGEMENT_DATE as a proxy for this + epc_data["LODGEMENT_DATETIME"] = pd.to_datetime(epc_data["LODGEMENT_DATETIME"], errors="coerce") + + epc_data = epc_data.sort_values( + ["LODGEMENT_DATE", "LODGEMENT_DATETIME"], ascending=False + ).drop_duplicates("UPRN") # Get G & F properties epc_data = epc_data[epc_data["CURRENT_ENERGY_RATING"].isin(["G", "F"])] @@ -401,6 +406,8 @@ def app(): ~company_ownership["Property Address"].str.lower().str.startswith(starting_term) ] + # address = properties[properties["UPRN"] == 100030253055].squeeze() + freehold_matching_lookup = [] # 634 leasehold_matching_lookup = [] # 86 shared_leasehold_match = [] @@ -493,12 +500,18 @@ def app(): # freehold_matching_lookup = pd.read_excel("freehold_matching_lookup V2.xlsx") # leasehold_matching_lookup = pd.read_excel("leasehold_matching_lookup V2.xlsx") + # freehold_matching_lookup.shape + # (1537, 4) + # leasehold_matching_lookup.shape + # (390, 4) + # The approximate matches aren't very good freehold_matching_lookup = freehold_matching_lookup[freehold_matching_lookup["match_type"] == "exact"] leasehold_matching_lookup = leasehold_matching_lookup[leasehold_matching_lookup["match_type"] == "exact"] # Combine combined_matching_lookup = pd.concat([freehold_matching_lookup, leasehold_matching_lookup]) + # Remove duplicates combined_matching_lookup = remove_duplicate_matches( matching_lookup=combined_matching_lookup, properties=properties, company_ownership=company_ownership @@ -566,7 +579,6 @@ def app(): land_registry_matches = [] for _, match in tqdm(matched_addresses.iterrows(), total=len(matched_addresses)): - # Filter land registry on the postcode lr_filtered = land_registry[ (land_registry["postcode"] == match["epc_postcode"].lower().strip()) @@ -782,7 +794,7 @@ def app(): right_on="uprn" ).drop(columns=["uprn"]) - # Flat anything that sold in the last year + # Flag anything that sold in the last year matched_addresses["sold_recently"] = ( matched_addresses["date_of_transfer"] >= 
pd.Timestamp.now() - pd.DateOffset(years=1) ) @@ -792,6 +804,9 @@ def app(): (matched_addresses["TRANSACTION_TYPE"].isin(["marketed sale", "non marketed sale"])) ) + # Save this + # matched_addresses.to_excel("combined_aggregate - pre filter 28th July.xlsx", index=False) + # Drop rows on the booleans matched_addresses = matched_addresses[ ~matched_addresses["sold_recently"] & @@ -835,7 +850,7 @@ def app(): # investment_50m_properties.to_excel("investment_50m_properties 28th July.xlsx", index=False) # Store the EPC data - # portfolio_epc_data_50m.to_excel("portfolio_epc_data_50m 29th July.xlsx", index=False) + # portfolio_epc_data_50m.to_excel("portfolio_epc_data_50m 28th July.xlsx", index=False) # We check if any of these properties are in a conservation area valuations = pd.read_excel("property value.xlsx") @@ -997,3 +1012,230 @@ def prepare_anonymised_data(): ) df.to_excel("Property List - 50% redacted.xlsx", index=False) + + +def adhoc_change_of_portfolio_analysis_july_2024(): + """ + This is just some adhoc analysis, which answers some questions which arose upon refreshing the SFR portfolio + in late July 2024 + :return: + """ + + # Question 1: Which properties in the previous portfolio were in conservation areas or had listed/heritage status? 
+ def answer_q1(): + # Data was just stored here: + geospatial_data = pd.read_excel("geospatial_data.xlsx") + + special_buildings = geospatial_data[ + (geospatial_data["conservation_status"] == 1) | + geospatial_data["is_listed_building"] | + geospatial_data["is_heritage_building"] + ] + + print( + f"There were {special_buildings.shape[0]} properties in the previous portfolio which were in conservation " + f"areas or had listed/heritage status" + ) + print(f"{(special_buildings['conservation_status'] == 1).sum()} were in a conservation area") + print(f"{special_buildings['is_listed_building'].sum()} were listed buildings") + print(f"{special_buildings['is_heritage_building'].sum()} were heritage buildings") + + answer_q1() + + # Question 2: For each property in the old portfolio, why was it lost? + def answer_q2(): + # We read in the previous 50m portfolio + previous_portfolio = pd.read_excel("investment_50m_properties 28th May.xlsx") # 39 owners + + new_matched_addresses = pd.read_excel("combined_aggregate - pre filter 28th July.xlsx") + new_portfolio = pd.read_excel("investment_50m_properties 28th July.xlsx") # 69 owners + + # dropped units + dropped_units = previous_portfolio[ + ~previous_portfolio["UPRN"].isin(new_portfolio["UPRN"].values) + ] + # Lots of properties are missed out - why + # 1) What was dropped, but was in the matched addresses and therefore was maybe filtered out + dropped_units_matched = dropped_units[ + dropped_units["UPRN"].isin(new_matched_addresses["UPRN"]) + ].copy() + + dropped_units_matched = dropped_units_matched.merge( + new_matched_addresses[ + ["UPRN", 'transaction_id', 'price', 'date_of_transfer', 'sold_recently', 'sale_lodged_recently'] + ], + how="left", on="UPRN" + ) + + # 97 units here - how many were sold + of_which_sold = dropped_units_matched[ + dropped_units_matched["sold_recently"] + ] + n_sold = of_which_sold.shape[0] + print(f"{n_sold} sold recently ({n_sold / previous_portfolio.shape[0] * 100})%") +
of_which_have_sale_epc_but_not_sold = dropped_units_matched[ + ~dropped_units_matched["sold_recently"] & dropped_units_matched["sale_lodged_recently"] + ] + n_with_sale_epc_but_not_yet_sold = of_which_have_sale_epc_but_not_sold.shape[0] + print( + f"{n_with_sale_epc_but_not_yet_sold} have a sale EPC but have not sold yet (" + f"{n_with_sale_epc_but_not_yet_sold / previous_portfolio.shape[0] * 100})%" + ) + + # What about things that haven't sold or don't look likely to sell + not_sold = dropped_units_matched[ + ~dropped_units_matched["sold_recently"] & ~dropped_units_matched["sale_lodged_recently"] + ] + + new_owner_sizes = new_portfolio.groupby( + ["Company Registration No. (1)"] + ).size().reset_index().rename(columns={0: "Number of Properties"}) + new_owner_sizes = new_owner_sizes.sort_values("Number of Properties", ascending=False) + + previous_owner_sizes = previous_portfolio.groupby( + ["Company Registration No. (1)"] + ).size().reset_index().rename(columns={0: "Number of Properties"}) + previous_owner_sizes = previous_owner_sizes.sort_values("Number of Properties", ascending=False) + + # Let's just confirm that we took in a bigger owner, as we see this unit was still matched + owner_too_small = [] + owner_big_enough = [] + for _, property in not_sold.iterrows(): + owner_reg_id = property["Company Registration No. (1)"] + old_portfolio_owner_size = previous_owner_sizes[ + previous_owner_sizes["Company Registration No. (1)"] == owner_reg_id + ] + # We make sure that the number of properties is smaller than the new smallest number + if ( + old_portfolio_owner_size["Number of Properties"].values[0] > + new_owner_sizes["Number of Properties"].min() + ): + owner_big_enough.append(property.to_dict()) + continue + + owner_too_small.append(property.to_dict()) + + n_owner_too_small = len(owner_too_small) + owner_big_enough = pd.DataFrame(owner_big_enough) + + summary = [] + for _, record in owner_big_enough.iterrows(): + # Do we have this new owner? 
+ new_owner = new_portfolio[ + new_portfolio["Company Registration No. (1)"] == record["Company Registration No. (1)"] + ] + if new_owner.empty: + # Why don't we have this new owner + new_owner_data = new_matched_addresses[ + new_matched_addresses["Company Registration No. (1)"] == record["Company Registration No. (1)"] + ] + + new_owner_data_filtered = new_owner_data[ + ~new_owner_data["sold_recently"] & ~new_owner_data["sale_lodged_recently"] + ] + + summary.append( + { + "Owner Name": record["Proprietor Name (1)"], + "Owner reg id": record["Company Registration No. (1)"], + "N properties in new portfolio before filtering": new_owner_data.shape[0], + "N properties in new portfolio after filtering": new_owner_data_filtered.shape[0], + } + + ) + continue + raise Exception("something went wrong") + + summary = pd.DataFrame(summary) + + not_accounted_for = summary[ + ( + summary["N properties in new portfolio before filtering"] < + previous_owner_sizes["Number of Properties"].min() + ) + ] + + # We have two owners not accounted for: + # ALLMID LIMITED, 01959058 + # CORAL RACING LIMITED, 541600 + # What happened to these owners? + new_epc = pd.read_excel("EPC F & G Properties - V2.xlsx") + allmid = previous_portfolio[previous_portfolio["Company Registration No. (1)"] == "01959058"].copy() + # Check if any of the properties are not in the new EPC data + allmid["not_in_new_epc"] = ~allmid["UPRN"].isin(new_epc["UPRN"]) + allmid["not_in_matched_pre_filtered"] = ~allmid["UPRN"].isin(new_matched_addresses["UPRN"]) + # In the previous portfolio, Allmid had 4 properties and in the re-build, it has just 2. Why? + # Firstly, one of their properties was re-surveyed not at an F/G + # Secondly, one of their properties is no longer owned by them: + # https://www.zoopla.co.uk/property/uprn/100070553074/ + # So as an owner, they fell out of the ranking + coral_racing = previous_portfolio[previous_portfolio["Company Registration No. 
(1)"] == "541600"].copy() + coral_racing["not_in_new_epc"] = ~coral_racing["UPRN"].isin(new_epc["UPRN"]) + coral_racing["not_in_matched_pre_filtered"] = ~coral_racing["UPRN"].isin(new_matched_addresses["UPRN"]) + # Coral goes down from 4 -> 1 on refresh, so what happened? + # 1) 2 properties had new EPCs and re-scored higher + # 2) 1 property, 85A Market Street, Church Gresley, Swadlincote, DE11 9PN is no longer matched to the ownership + # data, which is correct + + # Why were these units lost? + # There's just 1 owner, who is BARHAM PROPERTY LTD + owner_too_big_ids = owner_big_enough["Company Registration No. (1)"].unique() + owner_too_big_names = owner_big_enough["Proprietor Name (1)"].unique() + previous_owner_size = previous_owner_sizes[ + previous_owner_sizes["Company Registration No. (1)"].isin(owner_too_big_ids) + ] + new_owner_size = new_matched_addresses[ + new_matched_addresses["Company Registration No. (1)"].isin(owner_too_big_ids) | + new_matched_addresses["Proprietor Name (1)"].isin(owner_too_big_names) + ] + + n_unsold = new_owner_size[~new_owner_size["sold_recently"] & ~new_owner_size["sale_lodged_recently"]].shape + + # Happy with the justification to this point + assert ( + (n_sold + n_with_sale_epc_but_not_yet_sold + n_owner_too_small + len(owner_big_enough)) == + dropped_units_matched.shape[0] + ) + + # We now have a list of properties that were lost from the previous iteration to the next that were not matched + dropped_units_unmatched = dropped_units[ + ~dropped_units["UPRN"].isin(new_matched_addresses["UPRN"]) + ].copy() + + # A few possibilities: They aren't in the EPC data? + new_epc = pd.read_excel("EPC F & G Properties - V2.xlsx") + unmatched_not_in_epc = dropped_units_unmatched[ + ~dropped_units_unmatched["UPRN"].isin(new_epc["UPRN"]) + ] + # There are 17 units that have had new EPCs above a G + # Who were the owners? 
- various, nothing particularly remarkable + ( + previous_portfolio[ + previous_portfolio["UPRN"].isin(unmatched_not_in_epc["UPRN"]) + ]["Proprietor Name (1)"].value_counts() + ) + + # 22 final units to be accounted for...! + unmatched_in_epc = dropped_units_unmatched[ + dropped_units_unmatched["UPRN"].isin(new_epc["UPRN"]) + ] + + # Some of them will be due to ownership + # TODO: Read in freehold/leasehold data and see how many of these were non-exact matches! + leasehold_matching_lookup = pd.read_excel("leasehold_matching_lookup V2.xlsx") + freehold_matching_lookup = pd.read_excel("freehold_matching_lookup V2.xlsx") + combined_matching_lookup = pd.concat([leasehold_matching_lookup, freehold_matching_lookup]) + # This is 13 matches, all of them approximate + weak_matches = unmatched_in_epc.merge(combined_matching_lookup, how="inner", on="UPRN") + + # These have been lost due to ownership updates. This has been checked manually for every unit and there has + # been sale activity for each one, justifying the change in ownership data + remaining_matches = unmatched_in_epc[ + ~unmatched_in_epc["UPRN"].isin(weak_matches["UPRN"]) + ] + + assert dropped_units.shape[0] == ( + (n_sold + n_with_sale_epc_but_not_yet_sold + n_owner_too_small + len(owner_big_enough)) + len( + weak_matches) + unmatched_not_in_epc.shape[0] + )