mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
wave 3 applications closed
This commit is contained in:
parent
f6612c0cd4
commit
fff8f50f69
7 changed files with 191 additions and 11 deletions
64
etl/customers/cottons/parse_pdf_asset_list.py
Normal file
64
etl/customers/cottons/parse_pdf_asset_list.py
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
import re
|
||||
import pandas as pd
|
||||
from PyPDF2 import PdfReader
|
||||
|
||||
# Input PDFs to parse. They all live in the same folder and share one base
# name; the first has no numeric suffix and the others are numbered 2 to 6.
_DOWNLOADS_DIR = "/Users/khalimconn-kowlessar/Downloads"
_FILE_SUFFIXES = ("", " 2", " 3", " 4", " 5", " 6")

file_paths = [
    f"{_DOWNLOADS_DIR}/Managed Properties List (dragged){suffix}.pdf"
    for suffix in _FILE_SUFFIXES
]
|
||||
|
||||
|
||||
def extract_text_from_pdf_with_pypdf2(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Location of the PDF; handed straight to ``PdfReader``.

    Returns:
        The ``extract_text()`` result of each page, joined with no separator
        between pages.
    """
    pdf = PdfReader(file_path)
    return "".join(page.extract_text() for page in pdf.pages)
|
||||
|
||||
|
||||
# The patterns never change between files, so compile them once up front.
_REPORT_TITLE = re.compile(r"Managed Property Report as at \d+ \w+ \d+")
_COLUMN_HEADER = re.compile(r"Code Property Address Management Type")
_ROW_CANDIDATE = re.compile(r".*?Managed")
_ROW_FIELDS = re.compile(r"(\S+)\s+(.+?)\s+Managed")

# Collects the (code, address, management type) tuples from every PDF
all_parsed_data = []

for i, path in enumerate(file_paths):
    extracted_text = extract_text_from_pdf_with_pypdf2(path)

    # Drop the report title and the repeated column header
    cleaned_text = _COLUMN_HEADER.sub("", _REPORT_TITLE.sub("", extracted_text))

    # A candidate row is the shortest run of text on a line that ends in
    # "Managed". Candidates that do not split into "<code> <address> Managed"
    # are discarded.
    field_matches = (
        _ROW_FIELDS.match(candidate.strip())
        for candidate in _ROW_CANDIDATE.findall(cleaned_text)
    )
    parsed_data = [
        (found.group(1).strip(), found.group(2).strip(), "Managed")
        for found in field_matches
        if found
    ]

    all_parsed_data.extend(parsed_data)

    # Per-file row count, handy when checking the extraction
    print(f"File {i + 1} processed: {len(parsed_data)} rows")

# Combine the rows from every file into a single table
final_df = pd.DataFrame(
    all_parsed_data,
    columns=["Code", "Property Address", "Management Type"],
)

# Write the combined table out as an Excel workbook
final_output_file_path = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Unified_Managed_Properties_List.xlsx"
final_df.to_excel(final_output_file_path, index=False)

print(f"All files processed and combined. Total rows: {len(final_df)}")
print(f"Unified file saved to: {final_output_file_path}")
|
||||
15
etl/customers/cottons/prep_asset_list.py
Normal file
15
etl/customers/cottons/prep_asset_list.py
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
import pandas as pd
|
||||
|
||||
# The workbook is read, enriched with two columns and then overwritten in place.
ASSET_LIST_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/Cottons Asset List.xlsx"

df = pd.read_excel(ASSET_LIST_PATH)

# Split the address on commas: the first section is address1, the last section
# is the postcode. The vectorised .str accessor is used instead of a per-row
# lambda so that a missing address (NaN) stays NaN in both new columns rather
# than raising AttributeError on `.split`.
address_parts = df["Property Address"].str.split(",")
df["address1"] = address_parts.str[0].str.strip()
df["postcode"] = address_parts.str[-1].str.strip()

# Re-save over the original file
df.to_excel(ASSET_LIST_PATH, index=False)
|
||||
46
etl/customers/gla/hug_postcodes.py
Normal file
46
etl/customers/gla/hug_postcodes.py
Normal file
|
|
@ -0,0 +1,46 @@
|
|||
import inspect
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from tqdm import tqdm
|
||||
from etl.epc.settings import EARLIEST_EPC_DATE
|
||||
|
||||
# Path of the file in which the lambda is defined, i.e. this script.
# NOTE(review): not referenced anywhere else in the visible script.
src_file_path = inspect.getfile(lambda: None)

# Folder holding one sub-directory per EPC extract, each with a certificates.csv
EPC_DIRECTORY = Path("/Users/khalimconn-kowlessar/Downloads/all-domestic-certificates")
epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]

# One per-region count table per directory that contained matching rows
aggregation = []
for directory in tqdm(epc_directories):
    data = pd.read_csv(directory / "certificates.csv", low_memory=False)
    # Rename the columns to the same format as the api returns
    data.columns = [c.replace("_", "-").lower() for c in data.columns]

    # Keep only certificates whose post town mentions London
    data = data[data["posttown"].str.contains("London", case=False, na=False)]
    if data.empty:
        continue

    # Keep only certificates lodged on or after the date threshold
    data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]

    # Rows without a UPRN cannot be de-duplicated, so drop them
    data = data[~pd.isnull(data["uprn"])]
    # Take just the newest EPC per uprn, based on lodgement-date
    data = data.sort_values("lodgement-date", ascending=False).drop_duplicates("uprn")
    # Take EPC D and below
    data = data[data["current-energy-rating"].isin(["D", "E", "F", "G"])]

    # The postal region is the part of the postcode before the first space.
    # `assign` returns a new frame, which avoids writing a column into a
    # filtered slice of the CSV data.
    data = data.assign(postal_region=data["postcode"].str.split(" ").str[0])

    # Take homes whose main fuel is not mains gas
    off_gas = data[~data["main-fuel"].str.contains("mains gas", case=False, na=False)]

    region_summary = off_gas.groupby("postal_region").size().reset_index(name="count")
    aggregation.append(region_summary)

if aggregation:
    # Each directory contributes its own per-region counts; sum them so that
    # every postal region ends up as a single row with its overall total.
    postal_region_aggregation = (
        pd.concat(aggregation)
        .groupby("postal_region", as_index=False)["count"]
        .sum()
    )
else:
    # pd.concat raises on an empty list; fall back to an empty table so the
    # export below still produces a (header-only) workbook.
    postal_region_aggregation = pd.DataFrame(columns=["postal_region", "count"])

postal_region_aggregation = postal_region_aggregation.sort_values("count", ascending=False)
postal_region_aggregation = postal_region_aggregation.rename(
    columns={"postal_region": "Postcode Region", "count": "Number of Homes"}
)
postal_region_aggregation.to_excel(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/GLA/Off Gas EPC D-G Postal Regions.xlsx",
    index=False
)
|
||||
|
|
@ -305,7 +305,7 @@ def caha():
|
|||
|
||||
# Get conservation area data
|
||||
uprns = [x["uprn"] for x in extracted_data if x["uprn"] not in ["", None]]
|
||||
conservation_area_data = OpenUprnClient.get_spatial_data([100022526362], "retrofit-data-dev")
|
||||
conservation_area_data = OpenUprnClient.get_spatial_data([36284], "retrofit-data-dev")
|
||||
|
||||
addresses = pd.DataFrame(asset_list)
|
||||
addresses["uprn"] = addresses["uprn"].astype(str)
|
||||
|
|
|
|||
|
|
@ -2591,5 +2591,21 @@ def propsed_wave_3_sample():
|
|||
os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme V2.csv"), index=False
|
||||
)
|
||||
|
||||
survey_results = pd.read_excel(
|
||||
os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.19 V2.xlsx"),
|
||||
header=13,
|
||||
sheet_name="Modelled Packages"
|
||||
)
|
||||
|
||||
indivual_units = pd.read_csv(
|
||||
os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme V2.csv")
|
||||
)
|
||||
|
||||
u_aids = survey_results["Archetype ID"].astype(str).unique()
|
||||
units_in_bid = indivual_units[indivual_units['Unit in Programme']]["Archetype ID"].astype(str).values
|
||||
|
||||
len({v for v in units_in_bid if str(v) in u_aids})
|
||||
len(list(set(units_in_bid)))
|
||||
|
||||
# if __name__ == "__main__":
|
||||
# main()
|
||||
|
|
|
|||
|
|
@ -375,3 +375,41 @@ def app():
|
|||
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Properties Needing CWI - WIP.csv",
|
||||
index=False
|
||||
)
|
||||
|
||||
|
||||
def cross_reference_epc_programme():
    """Cross-reference the ECO3 "not completed" address list with the master sheet.

    Reads the ECO3 address workbook and the "Osmosis Reviewed - Parity
    Download" master sheet, derives a house number for every row of both with
    ``SearchEpc.get_house_number`` and then, for each ECO3 address, keeps the
    master-sheet rows that have the same house number and whose ``Address``
    has a fuzzy ratio above 90 against the ECO3 ``ADDRESS``.

    NOTE(review): the previous version computed the house-number match and
    then overwrote it with a fuzzy match over the whole master sheet, never
    stored any result and returned nothing. Combining the two filters is an
    assumption about the intent - confirm before relying on the output.

    Returns:
        pandas.DataFrame: the matched master-sheet rows, with an extra
        ``eco3_address`` column holding the ECO3 address each row was matched
        to. The frame has no rows when nothing matched.
    """
    # Imported once here rather than on every loop iteration.
    from fuzzywuzzy import fuzz

    eco3_fallout = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/STONEWATER LIST OF ADDRESSES TO BE "
        "SURVEYED - ECO3 NOT COMPLETED.xlsx"
    )

    # No separate postcode is passed for this list, only the address itself
    eco3_fallout["house_number"] = eco3_fallout.apply(
        lambda x: SearchEpc.get_house_number(x["ADDRESS"], ""), axis=1
    )

    stonewater_modelled_above_c = pd.read_csv(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - "
        "master sheet.csv",
        encoding='latin1'
    )

    stonewater_modelled_above_c["house_number"] = stonewater_modelled_above_c.apply(
        lambda x: SearchEpc.get_house_number(x["Address"], x["Postcode"]), axis=1
    )

    eco3_fallout_matched_to_above_c = []
    # `eco3_property` rather than `property`, which would shadow the builtin
    for _, eco3_property in eco3_fallout.iterrows():
        eco3_address = eco3_property["ADDRESS"]

        # Match on house number
        candidates = stonewater_modelled_above_c[
            stonewater_modelled_above_c["house_number"] == eco3_property["house_number"]
        ]
        if candidates.empty:
            continue

        # Within those candidates, fuzzy match on the address. Non-string
        # addresses (e.g. NaN from blank cells) are never treated as a match.
        is_close = candidates["Address"].apply(
            lambda address: isinstance(address, str) and fuzz.ratio(address, eco3_address) > 90
        )
        match = candidates[is_close]
        if not match.empty:
            eco3_fallout_matched_to_above_c.append(match.assign(eco3_address=eco3_address))

    if not eco3_fallout_matched_to_above_c:
        # Same columns as a populated result, just without any rows
        return stonewater_modelled_above_c.iloc[0:0].assign(eco3_address=None)
    return pd.concat(eco3_fallout_matched_to_above_c, ignore_index=True)
|
||||
|
|
|
|||
|
|
@ -120,17 +120,17 @@ def app():
|
|||
Property UPRN
|
||||
|
||||
"""
|
||||
DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/"
|
||||
DATA_FILENAME = "Bromford programme review.xlsx"
|
||||
SHEET_NAME = "Bromford"
|
||||
POSTCODE_COLUMN = "Postcode"
|
||||
FULLADDRESS_COLUMN = None
|
||||
ADDRESS1_COLUMN = "No."
|
||||
ADDRESS1_METHOD = "first_two_words"
|
||||
ADDRESS_COLS_TO_CONCAT = ["No.", "Address"]
|
||||
DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/"
|
||||
DATA_FILENAME = "Cottons Asset List.xlsx"
|
||||
SHEET_NAME = "Sheet1"
|
||||
POSTCODE_COLUMN = "postcode"
|
||||
FULLADDRESS_COLUMN = "Property Address"
|
||||
ADDRESS1_COLUMN = "address1"
|
||||
ADDRESS1_METHOD = None
|
||||
ADDRESS_COLS_TO_CONCAT = []
|
||||
|
||||
asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME)
|
||||
asset_list = asset_list[~pd.isnull(asset_list["Postcode"])]
|
||||
# asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index()
|
||||
asset_list["row_id"] = asset_list.index
|
||||
|
||||
# We clean up potential non-breaking spaces, and double spaces
|
||||
|
|
@ -202,7 +202,8 @@ def app():
|
|||
|
||||
transformed_df = pd.DataFrame(transformed_data)
|
||||
# Drop the column that is ""
|
||||
transformed_df = transformed_df.drop(columns=[""])
|
||||
if "" in transformed_df.columns:
|
||||
transformed_df = transformed_df.drop(columns=[""])
|
||||
|
||||
# Get the find my epc data
|
||||
find_my_epc_data = epc_df[["row_id", "find_my_epc_data"]].drop(columns=["find_my_epc_data"]).join(
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue