mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
173 lines
7 KiB
Python
173 lines
7 KiB
Python
"""
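This script performs some basic analysis to identify EPC data for postcodes specified in the Warmer Homes Local Grant
(WHLG). It then matches those properties to their registered owners and looks up an example owner on Companies House.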
"""
|
|
|
|
import inspect
|
|
import requests
|
|
import json
|
|
import pandas as pd
|
|
from pathlib import Path
|
|
from etl.ownership.Ownership import Ownership
|
|
|
|
# Read the WHLG eligibility workbook; only the 'Eligible postcodes' sheet is needed
postcodes = pd.read_excel(
    "/Users/khalimconn-kowlessar/Downloads/WHLG-eligible-postcodes_RP edit.xlsx",
    sheet_name='Eligible postcodes',
)

# Keep the three columns of interest and give them short, readable names
postcodes = postcodes.loc[
    :, ['List of eligible postcodes via the IMD Income Decile 1-2 pathway', 'Unnamed: 1', 'Unnamed: 2']
]
postcodes.columns = ['postcode', 'Local Authority', 'London Borough?']

# Discard the rows labelled 0 and 1 (presumably sub-header rows in the sheet - confirm)
postcodes = postcodes.drop(index=[0, 1])

# Restrict to the rows flagged as London Boroughs
postcodes = postcodes.loc[postcodes["London Borough?"] == "Yes"]
|
|
# Since there are a large number of postcodes (425k), let's just take a few examples
|
|
# Take postcodes that begin with "BN15"
|
|
# postcodes = postcodes[postcodes["postcode"].str.startswith("BN15")]
|
|
|
|
# The Local Authority is Adur, so let's get the EPC data for this area
|
|
# epc_data = pd.read_csv(
|
|
# "/Users/khalimconn-kowlessar/Documents/hestia/Model/local_data/all-domestic-certificates/domestic-E07000223-Adur"
|
|
# "/certificates.csv", low_memory=False
|
|
# )
|
|
# # Filter on these postcodes
|
|
# epc_data = epc_data[epc_data["POSTCODE"].str.lower().isin(postcodes["postcode"].str.lower())]
|
|
# epc_data = epc_data[~pd.isnull(epc_data["UPRN"])]
|
|
# # Take the newest EPC for each UPRN, based on LODGEMENT_DATE
|
|
# epc_data["LODGEMENT_DATE"] = pd.to_datetime(epc_data["LODGEMENT_DATE"])
|
|
# epc_data = epc_data.sort_values("LODGEMENT_DATE", ascending=False).drop_duplicates("UPRN")
|
|
#
|
|
# # Let's look at the breakdown of EPC ratings. We want the count and the % of the total
|
|
# ratings_distribution = epc_data.groupby("CURRENT_ENERGY_RATING").size().reset_index()
|
|
# ratings_distribution.columns = ["Rating", "Count"]
|
|
# ratings_distribution["Percentage"] = ratings_distribution["Count"] / ratings_distribution["Count"].sum() * 100
|
|
|
|
# Can we identify the owners of these units so we can contact them?
|
|
|
|
# Locate the EPC downloads relative to this file: asking `inspect` for the file that defines a throwaway lambda
# yields the path of the file containing this code.
file_src = inspect.getfile(lambda x: None)
DATA_DIRECTORY = Path(file_src).parent / "local_data" / "all-domestic-certificates"

# One certificates.csv path (as a string) for every sub-directory of the data directory
epc_paths = [
    str(folder / "certificates.csv")
    for folder in DATA_DIRECTORY.iterdir()
    if folder.is_dir()
]
|
|
|
|
# Settings for the ownership-matching run, gathered in one place before the object is built.
# The three /Downloads paths are local extracts (file names suggest CCOD, OCOD and price-paid data - confirm).
ownership_settings = dict(
    epc_paths=epc_paths,
    domestic_ownership_path="/Users/khalimconn-kowlessar/Downloads/CCOD_FULL_2024_07.csv",
    overseas_ownership_path="/Users/khalimconn-kowlessar/Downloads/OCOD_FULL_2024_07.csv",
    land_registry_path="/Users/khalimconn-kowlessar/Downloads/pp-complete.csv",
    project_name="gla-proposal",
    bucket="retrofit-data-dev",
    average_property_value=0,
    portfolio_value=0,
    excluded_owners=[],
    excluded_uprns=[],
    save=True,
)

ownership = Ownership(**ownership_settings)
|
|
|
|
# Step 1: Source the EPC properties for the eligible postcodes (lower-cased), with no column filters
# Data will be found at ownership/gla-proposal
eligible_postcodes = postcodes["postcode"].str.lower().tolist()
ownership.source_epc_properties(column_filters={}, postcodes=eligible_postcodes)

# Step 2: Get company ownership data
ownership.load_company_ownership()

# Step 3: Prepare data for matching
ownership.prepare_for_matching()

# Step 4: Match EPC data to ownership data
ownership.match()
|
|
|
|
from utils.s3 import save_excel_to_s3, read_excel_from_s3
|
|
|
|
# Save the data to S3
|
|
# save_excel_to_s3(
|
|
# df=ownership.matched_addresses,
|
|
# bucket_name=ownership.bucket,
|
|
# file_key=ownership.matched_addresses_pre_filter_filepath
|
|
# )
|
|
|
|
# We have the matches from the run above, which we now need to match to the postcodes.
#
# A snapshot saved by an earlier run can be loaded with `read_excel_from_s3` instead, using
# bucket_name=ownership.bucket, header_row=0 and
# file_key="ownership/gla-proposal/2024-10-10 19:02:34.131365/matched_addresses_pre_filter.xlsx".
# It is deliberately not downloaded here: the in-memory matches below would replace it straight away, so the S3
# round-trip would only slow the script down.
matches = ownership.matched_addresses.copy()
|
|
# Tenure values that take a property out of scope: social rentals and records without a usable tenure
excluded_tenures = [
    "Rented (social)",
    "rental (social)",
    "Not defined - use in the case of a new dwelling for which the intended tenure in not known. It is not to be "
    "used for an existing dwelling",
    "NO DATA!",
]

# Keep the matches that sit in one of the postcodes we're interested in (compared case-insensitively) and whose
# tenure is not one of the excluded values
in_eligible_postcode = matches["epc_postcode"].str.lower().isin(postcodes["postcode"].str.lower())
has_excluded_tenure = matches["TENURE"].isin(excluded_tenures)
matches = matches[in_eligible_postcode & ~has_excluded_tenure]

# Flag the private rented sector (PRS) properties
matches["is_prs"] = matches["TENURE"].isin(["rental (private)", "Rented (private)"])
|
|
# Distribution of EPC ratings across the remaining matches: one row per rating, with its count and its share of
# the total as a percentage
epc_ratings = (
    matches.groupby(["CURRENT_ENERGY_RATING"])
    .size()
    .reset_index(name="Count")
    .rename(columns={"CURRENT_ENERGY_RATING": "EPC Rating"})
)
rating_total = epc_ratings["Count"].sum()
epc_ratings["Percentage"] = epc_ratings["Count"] / rating_total * 100
|
|
|
|
# Keep only the properties rated below EPC C (i.e. D to G), as defined by the guidance
# NOTE(review): new builds are not removed by this filter - confirm whether they still need excluding
below_c_ratings = ["D", "E", "F", "G"]
matches = matches[matches["CURRENT_ENERGY_RATING"].isin(below_c_ratings)]
# 11,694 properties

# Number of distinct postcodes covered (the value is only displayed when run interactively)
matches["epc_postcode"].nunique()
# 6899
|
|
|
|
# Count the properties held by each registered owner, largest holdings first, with each owner's share of the total
# NOTE(review): groupby drops rows whose key is missing by default, so owners without a company registration
# number are presumably left out of these counts - confirm this is intended
owner_keys = ['Proprietor Name (1)', 'Company Registration No. (1)']
owners_count = (
    matches.groupby(owner_keys)
    .size()
    .reset_index(name='Count')
    .rename(columns={'Proprietor Name (1)': 'Owner', 'Company Registration No. (1)': 'Owner Registration #'})
    .sort_values(by='Count', ascending=False)
)
owned_total = owners_count["Count"].sum()
owners_count["Percentage"] = owners_count["Count"] / owned_total * 100
|
|
|
|
# Work through a single postal district as an example
matches = matches.sort_values(by="epc_postcode")

# BR1, BR5
in_example_district = matches["epc_postcode"].str.startswith("CR0 ")
example = matches[in_example_district].copy()

# Only the privately rented properties are of interest for the example
example = example[example["TENURE"].isin(["rental (private)", "Rented (private)"])]

# Widen the pandas display limits so that the preview below is not truncated
for display_option, display_limit in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(display_option, display_limit)

# Preview the first few example properties (the frame is only displayed when run interactively)
preview_columns = [
    "epc_address",
    "epc_postcode",
    "CURRENT_ENERGY_RATING",
    "CURRENT_ENERGY_EFFICIENCY",
    "Proprietor Name (1)",
    "Company Registration No. (1)",
]
example[preview_columns].head(4)
|
|
|
|
# Bring building details from the EPC records onto the example properties, joining on UPRN.
# The EPC-side UPRN is cast to int first (this updates ownership.epc_data in place).
# NOTE(review): astype(int) raises if any UPRN is missing - presumably they were removed upstream; confirm
ownership.epc_data["UPRN"] = ownership.epc_data["UPRN"].astype(int)

fabric_columns = ["UPRN", "BUILT_FORM", "PROPERTY_TYPE", "WALLS_DESCRIPTION", "ROOF_DESCRIPTION"]
example = example.merge(ownership.epc_data[fabric_columns], on="UPRN", how="left")

# Privately rented examples with an EPC rating of E
is_rated_e = example["CURRENT_ENERGY_RATING"] == "E"
is_private_rental = example["TENURE"].isin(["rental (private)", "Rented (private)"])
z = example[is_rated_e & is_private_rental]
|
|
|
|
import os
from pprint import pprint

# Look up the registered owner of the first example property on Companies House, then fetch that company's
# persons with significant control (PSC).

# The API key is a credential, so it is read from the environment instead of being committed to the repository.
# NOTE(review): any key that was previously hard-coded in this file should be revoked and reissued.
companies_house_api_key = os.environ.get("COMPANIES_HOUSE_API_KEY")
if not companies_house_api_key:
    raise RuntimeError("Set the COMPANIES_HOUSE_API_KEY environment variable to a Companies House API key")

# Seconds to wait on each request, so that an unresponsive API cannot hang the script indefinitely
companies_house_timeout = 30

company_number = example.head(1)["Company Registration No. (1)"].values[0]
url = f'https://api.company-information.service.gov.uk/company/{company_number}'

# Make the API request (HTTP basic auth: the key is the username and the password is left blank)
response = requests.get(url, auth=(companies_house_api_key, ''), timeout=companies_house_timeout)

# Check if the request was successful
if response.status_code == 200:
    company_data = response.json()
    # Pretty-print the fetched data
    print(json.dumps(company_data, indent=4))
else:
    print(f"Failed to fetch data. Status code: {response.status_code}")
    # Company numbers are eight characters long and leading zeros are easily lost when the number has been
    # handled as an integer upstream, so retry with the number left-padded with zeros to eight characters
    company_number = str(company_number).zfill(8)
    url = f'https://api.company-information.service.gov.uk/company/{company_number}'
    response = requests.get(url, auth=(companies_house_api_key, ''), timeout=companies_house_timeout)
    company_data = response.json()

pprint(company_data)

# Fetch the persons with significant control for the company number that was last requested
psc_url = f'https://api.company-information.service.gov.uk/company/{company_number}/persons-with-significant-control'
psc_response = requests.get(psc_url, auth=(companies_house_api_key, ''), timeout=companies_house_timeout)
psc_data = psc_response.json()
pprint(psc_data)
|