wave 3 applications closed

This commit is contained in:
Khalim Conn-Kowlessar 2024-11-27 10:16:06 +00:00
parent f6612c0cd4
commit fff8f50f69
7 changed files with 191 additions and 11 deletions

View file

@ -0,0 +1,64 @@
import re
import pandas as pd
from PyPDF2 import PdfReader
# Locations of the six source PDFs.
# The first file has no numeric suffix; the remaining five are numbered 2 through 6.
file_paths = ["/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged).pdf"] + [
    f"/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) {n}.pdf"
    for n in range(2, 7)
]
# Read a PDF with PyPDF2 and hand back all of its page text as one string
def extract_text_from_pdf_with_pypdf2(file_path):
    """Return the text of every page of the PDF at ``file_path``, concatenated.

    Pages are joined in reading order with no separator inserted between them.
    """
    pdf = PdfReader(file_path)
    return "".join(page.extract_text() for page in pdf.pages)
# Rows parsed from every PDF accumulate here
all_parsed_data = []

# Patterns compiled once: the report title, the repeated column header,
# a candidate row (text up to and including "Managed"), and the row's fields.
TITLE_PATTERN = re.compile(r"Managed Property Report as at \d+ \w+ \d+")
HEADER_PATTERN = re.compile(r"Code Property Address Management Type")
ROW_CANDIDATE_PATTERN = re.compile(r".*?Managed")
ROW_FIELDS_PATTERN = re.compile(r"(\S+)\s+(.+?)\s+Managed")

# Handle the PDFs one at a time
for file_number, pdf_path in enumerate(file_paths, start=1):
    raw_text = extract_text_from_pdf_with_pypdf2(pdf_path)

    # Step 1: strip the report title and the repeated column header
    body_text = HEADER_PATTERN.sub("", TITLE_PATTERN.sub("", raw_text))

    # Step 2: pick out every candidate row ending in "Managed" and try to
    # split it into its fields
    field_matches = (
        ROW_FIELDS_PATTERN.match(candidate.strip())
        for candidate in ROW_CANDIDATE_PATTERN.findall(body_text)
    )

    # Step 3: keep the candidates that did split into code + address
    file_rows = [
        (found.group(1).strip(), found.group(2).strip(), "Managed")
        for found in field_matches
        if found
    ]

    all_parsed_data.extend(file_rows)

    # Per-file row count, useful when debugging the extraction
    print(f"File {file_number} processed: {len(file_rows)} rows")

# Step 4: one DataFrame covering all files
final_df = pd.DataFrame(all_parsed_data, columns=["Code", "Property Address", "Management Type"])

# Step 5: write the combined table out as an Excel workbook
final_output_file_path = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Unified_Managed_Properties_List.xlsx"
final_df.to_excel(final_output_file_path, index=False)

# Summary for the operator
print(f"All files processed and combined. Total rows: {len(final_df)}")
print(f"Unified file saved to: {final_output_file_path}")

View file

@ -0,0 +1,15 @@
import pandas as pd
# Workbook that is read, given the two extra columns, and then overwritten in place
asset_list_path = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/Cottons Asset List.xlsx"
df = pd.read_excel(asset_list_path)

# Split the address on commas: the first section is address1, the last section is the postcode.
# The vectorised .str accessor carries a missing address through as NaN, whereas calling
# x.split(",") on a blank cell (which pandas reads as a float NaN) raises AttributeError.
address_parts = df["Property Address"].str.split(",")
df["address1"] = address_parts.str[0].str.strip()
df["postcode"] = address_parts.str[-1].str.strip()

# Re-save over the same workbook
df.to_excel(asset_list_path, index=False)

View file

@ -0,0 +1,46 @@
import inspect
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from etl.epc.settings import EARLIEST_EPC_DATE
# File in which this script is defined, resolved through a throwaway lambda.
# Not referenced again in the visible part of the script.
src_file_path = inspect.getfile(lambda: None)

# Root folder of the EPC download; a certificates.csv is read from each sub-directory
EPC_DIRECTORY = Path("/Users/khalimconn-kowlessar/Downloads/all-domestic-certificates")
epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]

# One per-postal-region summary frame per directory that has London rows
aggregation = []
for directory in tqdm(epc_directories):
    data = pd.read_csv(directory / "certificates.csv", low_memory=False)

    # Rename the columns to the same format as the api returns
    data.columns = [c.replace("_", "-").lower() for c in data.columns]

    # Keep only rows whose post town mentions London (case-insensitive)
    data = data[data["posttown"].str.contains("London", case=False, na=False)]
    if data.empty:
        continue

    # Keep EPCs lodged on or after the earliest accepted date
    data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]
    data = data[~pd.isnull(data["uprn"])]

    # Take just the newest EPC per uprn, based on lodgement-date
    data = data.sort_values("lodgement-date", ascending=False).drop_duplicates("uprn")

    # Take EPC D and below
    data = data[data["current-energy-rating"].isin(["D", "E", "F", "G"])]

    # Postal region = the part of the postcode before the first space.
    # assign() returns a new frame, avoiding assignment onto a filtered slice.
    data = data.assign(postal_region=data["postcode"].str.split(" ").str[0])

    # Take homes whose main fuel is not mains gas
    off_gas = data[~data["main-fuel"].str.contains("mains gas", case=False, na=False)]

    region_summary = off_gas.groupby("postal_region").size().reset_index(name="count")
    aggregation.append(region_summary)

if aggregation:
    # The same postal region may show up in the summaries of several directories,
    # so the per-directory counts are added together to give one row per region.
    postal_region_aggregation = (
        pd.concat(aggregation).groupby("postal_region", as_index=False)["count"].sum()
    )
else:
    # No directory produced any rows: write an empty table instead of failing in pd.concat
    postal_region_aggregation = pd.DataFrame(columns=["postal_region", "count"])

postal_region_aggregation = postal_region_aggregation.sort_values("count", ascending=False)
postal_region_aggregation = postal_region_aggregation.rename(
    columns={"postal_region": "Postcode Region", "count": "Number of Homes"}
)

postal_region_aggregation.to_excel(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/GLA/Off Gas EPC D-G Postal Regions.xlsx",
    index=False
)

View file

@ -305,7 +305,7 @@ def caha():
# Get conservation area data
uprns = [x["uprn"] for x in extracted_data if x["uprn"] not in ["", None]]
conservation_area_data = OpenUprnClient.get_spatial_data([100022526362], "retrofit-data-dev")
conservation_area_data = OpenUprnClient.get_spatial_data([36284], "retrofit-data-dev")
addresses = pd.DataFrame(asset_list)
addresses["uprn"] = addresses["uprn"].astype(str)

View file

@ -2591,5 +2591,21 @@ def propsed_wave_3_sample():
os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme V2.csv"), index=False
)
survey_results = pd.read_excel(
os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.19 V2.xlsx"),
header=13,
sheet_name="Modelled Packages"
)
indivual_units = pd.read_csv(
os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme V2.csv")
)
u_aids = survey_results["Archetype ID"].astype(str).unique()
units_in_bid = indivual_units[indivual_units['Unit in Programme']]["Archetype ID"].astype(str).values
len({v for v in units_in_bid if str(v) in u_aids})
len(list(set(units_in_bid)))
# if __name__ == "__main__":
# main()

View file

@ -375,3 +375,41 @@ def app():
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Properties Needing CWI - WIP.csv",
index=False
)
def cross_reference_epc_programme():
    """Match the ECO3 "not completed" address list against the Parity master sheet.

    Both files are loaded from the Stonewater customer folder and each row is given a
    ``house_number`` via ``SearchEpc.get_house_number``. For every ECO3 address, the
    master-sheet rows sharing its house number are shortlisted, and a shortlisted row is
    kept when ``fuzz.ratio`` between its ``Address`` and the ECO3 ``ADDRESS`` exceeds 90.

    NOTE(review): the earlier version computed the house-number shortlist and then
    overwrote it with a fuzzy match over the whole sheet, and never stored any result.
    Chaining the two steps and returning the matches is the presumed intent - confirm.

    Returns:
        pandas.DataFrame: the matched master-sheet rows, each with an extra
        ``eco3_address`` column holding the ECO3 address it was matched to.
        Has no rows when nothing matched.
    """
    # Function-scope import, hoisted out of the per-row loop
    from fuzzywuzzy import fuzz

    eco3_fallout = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/STONEWATER LIST OF ADDRESSES TO BE "
        "SURVEYED - ECO3 NOT COMPLETED.xlsx"
    )
    # This sheet is parsed with an empty postcode argument
    eco3_fallout["house_number"] = eco3_fallout.apply(
        lambda x: SearchEpc.get_house_number(x["ADDRESS"], ""), axis=1
    )

    stonewater_modelled_above_c = pd.read_csv(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - "
        "master sheet.csv",
        encoding='latin1'
    )
    stonewater_modelled_above_c["house_number"] = stonewater_modelled_above_c.apply(
        lambda x: SearchEpc.get_house_number(x["Address"], x["Postcode"]), axis=1
    )

    eco3_fallout_matched_to_above_c = []
    for _, fallout_row in eco3_fallout.iterrows():
        fallout_address = fallout_row["ADDRESS"]
        # A blank address cell cannot be fuzzy-matched
        if not isinstance(fallout_address, str):
            continue

        # Match on house number
        candidates = stonewater_modelled_above_c[
            stonewater_modelled_above_c["house_number"] == fallout_row["house_number"]
        ]
        if candidates.empty:
            continue

        # Then fuzzy match the address (Levenshtein-based ratio) within that shortlist
        is_close = candidates["Address"].apply(
            lambda x: isinstance(x, str) and fuzz.ratio(x, fallout_address) > 90
        )
        match = candidates[is_close]
        if not match.empty:
            eco3_fallout_matched_to_above_c.append(match.assign(eco3_address=fallout_address))

    if not eco3_fallout_matched_to_above_c:
        # Same columns as a populated result, but no rows
        return stonewater_modelled_above_c.head(0).assign(eco3_address=None)
    return pd.concat(eco3_fallout_matched_to_above_c, ignore_index=True)

View file

@ -120,17 +120,17 @@ def app():
Property UPRN
"""
DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/"
DATA_FILENAME = "Bromford programme review.xlsx"
SHEET_NAME = "Bromford"
POSTCODE_COLUMN = "Postcode"
FULLADDRESS_COLUMN = None
ADDRESS1_COLUMN = "No."
ADDRESS1_METHOD = "first_two_words"
ADDRESS_COLS_TO_CONCAT = ["No.", "Address"]
DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/"
DATA_FILENAME = "Cottons Asset List.xlsx"
SHEET_NAME = "Sheet1"
POSTCODE_COLUMN = "postcode"
FULLADDRESS_COLUMN = "Property Address"
ADDRESS1_COLUMN = "address1"
ADDRESS1_METHOD = None
ADDRESS_COLS_TO_CONCAT = []
asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME)
asset_list = asset_list[~pd.isnull(asset_list["Postcode"])]
# asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index()
asset_list["row_id"] = asset_list.index
# We clean up potential non-breaking spaces, and double spaces
@ -202,7 +202,8 @@ def app():
transformed_df = pd.DataFrame(transformed_data)
# Drop the column that is ""
transformed_df = transformed_df.drop(columns=[""])
if "" in transformed_df.columns:
transformed_df = transformed_df.drop(columns=[""])
# Get the find my epc data
find_my_epc_data = epc_df[["row_id", "find_my_epc_data"]].drop(columns=["find_my_epc_data"]).join(