setting up code for gla proposal

This commit is contained in:
Khalim Conn-Kowlessar 2024-10-10 18:12:29 +01:00
parent ecf05369dd
commit 2d7e9a3cc9
3 changed files with 88 additions and 7 deletions

View file

View file

@ -0,0 +1,76 @@
"""
This script performs some basic analysis to identify EPC data for postcodes specified in the Warmer Homes Local Grant
"""
# NOTE(review): `match` is never referenced as a bare name in the visible script
# (`ownership.match()` is a method call, not this import), so this looks like an
# accidental IDE auto-import. `nis` is Unix-only and was removed from the
# standard library in Python 3.13 (PEP 594) -- confirm and remove.
from nis import match
import pandas as pd
from etl.ownership.Ownership import Ownership
# Load the "Eligible postcodes" sheet of the WHLG eligibility workbook.
postcodes = pd.read_excel(
    "/Users/khalimconn-kowlessar/Downloads/WHLG-eligible-postcodes.xlsx",
    sheet_name='Eligible postcodes',
)
# Take just the first two columns and give them workable names
postcodes = postcodes[
    ['List of eligible postcodes via the IMD Income Decile 1-2 pathway', 'Unnamed: 1']
]
postcodes.columns = ['postcode', 'Local Authority']
# Drop the rows labelled 0 and 1, i.e. the first two rows of the sheet as read.
# NOTE(review): presumably these are sub-header / notes rows -- confirm against the workbook.
postcodes = postcodes.drop([0, 1])
# Since there are a large number of postcodes (425k), let's just take a few examples.
# Take postcodes that begin with "BN15". `na=False` makes blank or non-string
# cells count as "no match"; without it the mask contains NaN for those cells
# and the boolean indexing raises a ValueError.
postcodes = postcodes[postcodes["postcode"].str.startswith("BN15", na=False)]
# The Local Authority is Adur, so let's get the EPC data for this area
# epc_data = pd.read_csv(
# "/Users/khalimconn-kowlessar/Documents/hestia/Model/local_data/all-domestic-certificates/domestic-E07000223-Adur"
# "/certificates.csv", low_memory=False
# )
# # Filter on these postcodes
# epc_data = epc_data[epc_data["POSTCODE"].str.lower().isin(postcodes["postcode"].str.lower())]
# epc_data = epc_data[~pd.isnull(epc_data["UPRN"])]
# # Take the newest EPC for each UPRN, based on LODGEMENT_DATE
# epc_data["LODGEMENT_DATE"] = pd.to_datetime(epc_data["LODGEMENT_DATE"])
# epc_data = epc_data.sort_values("LODGEMENT_DATE", ascending=False).drop_duplicates("UPRN")
#
# # Let's look at the breakdown of EPC ratings. We want the count and the % of the total
# ratings_distribution = epc_data.groupby("CURRENT_ENERGY_RATING").size().reset_index()
# ratings_distribution.columns = ["Rating", "Count"]
# ratings_distribution["Percentage"] = ratings_distribution["Count"] / ratings_distribution["Count"].sum() * 100
# Try to identify who owns these properties so that the owners can be contacted.
ownership = Ownership(
    project_name="gla-proposal",
    bucket="retrofit-data-dev",
    epc_paths=[
        "/Users/khalimconn-kowlessar/Documents/hestia/Model/local_data/all-domestic-certificates/domestic-E07000223"
        "-Adur/certificates.csv"
    ],
    domestic_ownership_path="/Users/khalimconn-kowlessar/Downloads/CCOD_FULL_2024_07.csv",
    overseas_ownership_path="/Users/khalimconn-kowlessar/Downloads/OCOD_FULL_2024_07.csv",
    land_registry_path="/Users/khalimconn-kowlessar/Downloads/pp-complete.csv",
    # No valuation inputs and no exclusions for this exploratory run.
    average_property_value=0,
    portfolio_value=0,
    excluded_owners=[],
    excluded_uprns=[],
    # NOTE(review): save=False presumably means nothing is written under
    # ownership/gla-proposal in the bucket for this run -- confirm in Ownership.
    save=False,
)

# Run the ownership pipeline; the steps must execute in this order.
# Step 1: source the EPC properties, applying no column filters.
ownership.source_epc_properties(column_filters={})
# Step 2: load the company ownership data.
ownership.load_company_ownership()
# Step 3: prepare both data sets for matching.
ownership.prepare_for_matching()
# Step 4: match the EPC data to the ownership data.
ownership.match()

# Work on a copy of the matched addresses so the pipeline's own result is untouched.
matches = ownership.matched_addresses.copy()

# Keep only matches whose EPC postcode is one of the postcodes of interest
# (both sides are lower-cased before comparing).
target_postcodes = postcodes["postcode"].str.lower()
in_target_postcodes = matches["epc_postcode"].str.lower().isin(target_postcodes)
matches = matches[in_target_postcodes]

# Discard rows whose TENURE is one of the two social-rental labels below.
is_social_rental = matches["TENURE"].isin(["Rented (social)", "rental (social)"])
matches = matches[~is_social_rental]

View file

@ -61,6 +61,7 @@ class Ownership:
portfolio_value: float,
excluded_owners: List[str] = None,
excluded_uprns: List[int] = None,
save=True
):
"""
@ -115,6 +116,8 @@ class Ownership:
f"ownership/{self.project_name}/{self.run_timestamp}/portfolio_epc_data.xlsx"
)
self.save = save
# Data
self.epc_data = None
self.ownership_data = None
@ -210,12 +213,13 @@ class Ownership:
if self.excluded_uprns:
self.epc_data = self.epc_data[~self.epc_data["UPRN"].astype(float).isin(self.excluded_uprns)]
# We now store the data in s3
save_excel_to_s3(
df=self.epc_data,
bucket_name=self.bucket,
file_key=self.epc_data_filepath
)
if self.save:
# We now store the data in s3
save_excel_to_s3(
df=self.epc_data,
bucket_name=self.bucket,
file_key=self.epc_data_filepath
)
def load_company_ownership(self):
"""
@ -590,7 +594,8 @@ class Ownership:
"CURRENT_ENERGY_RATING",
"POSTCODE",
"LODGEMENT_DATE",
"TRANSACTION_TYPE"
"TRANSACTION_TYPE",
"TENURE",
]
].rename(
columns={