mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
mount home directory to devcontainer home directory
This commit is contained in:
parent
b7e201f3d4
commit
d4ac6aee71
5 changed files with 76 additions and 176 deletions
|
|
@ -6,7 +6,7 @@
|
|||
"workspaceFolder": "/workspaces/model",
|
||||
"postStartCommand": "bash .devcontainer/backend/post-install.sh",
|
||||
"mounts": [
|
||||
"source=${localEnv:HOME},target=/workspaces/home,type=bind"
|
||||
"source=${localEnv:HOME},target=/home/vscode,type=bind"
|
||||
],
|
||||
"customizations": {
|
||||
"vscode": {
|
||||
|
|
|
|||
|
|
@ -34,7 +34,7 @@ from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
|
|||
logger = setup_logger()
|
||||
|
||||
# OpenAI API Key (set this in your environment variables for security)
|
||||
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "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")
|
||||
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -13,11 +13,15 @@ from asset_list.utils import get_data
|
|||
from dotenv import load_dotenv
|
||||
from backend.SearchEpc import SearchEpc
|
||||
|
||||
load_dotenv(dotenv_path="backend/.env")
|
||||
load_dotenv(dotenv_path="../backend/.env")
|
||||
EPC_AUTH_TOKEN = os.getenv(
|
||||
"EPC_AUTH_TOKEN",
|
||||
)
|
||||
|
||||
OPENAI_API_KEY = os.getenv(
|
||||
"OPENAI_API_KEY",
|
||||
)
|
||||
|
||||
|
||||
def extract_address1(
|
||||
asset_list, full_address_col, postcode_col, method="first_two_words"
|
||||
|
|
@ -69,72 +73,24 @@ def app():
|
|||
Property UPRN
|
||||
"""
|
||||
|
||||
<<<<<<< HEAD
|
||||
data_folder = "/workspaces/model/asset_list/"
|
||||
data_filename = "manchester.xlsx"
|
||||
sheet_name = "PW0099 - Property List"
|
||||
postcode_column = "post Code"
|
||||
address1_column = "address"
|
||||
address1_method = None
|
||||
fulladdress_column = None
|
||||
address_cols_to_concat = ["address"]
|
||||
=======
|
||||
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Aspire"
|
||||
data_filename = "ASPIRE ASSET LIST.xlsx"
|
||||
sheet_name = "Asset List"
|
||||
postcode_column = "Postcode"
|
||||
data_folder = "/workspaces/model/asset_list"
|
||||
data_filename = "assets.xlsx"
|
||||
sheet_name = "Sheet1"
|
||||
postcode_column = "POSTCODE"
|
||||
address1_column = None
|
||||
address1_method = "house_number_extraction"
|
||||
fulladdress_column = "Address"
|
||||
fulladdress_column = "ADDRESS"
|
||||
address_cols_to_concat = []
|
||||
missing_postcodes_method = None
|
||||
landlord_year_built = None
|
||||
landlord_os_uprn = None
|
||||
landlord_property_type = "Property Type"
|
||||
landlord_built_form = None
|
||||
landlord_wall_construction = None
|
||||
landlord_roof_construction = None
|
||||
landlord_heating_system = None
|
||||
landlord_property_type = "PROPERTY TYPE"
|
||||
landlord_built_form = None # Skipped as empty
|
||||
landlord_wall_construction = "wall combined" # combin F + G
|
||||
landlord_roof_construction = "HEATING SYSTEM" # Combine I + J
|
||||
landlord_heating_system = None # Check with Khalim
|
||||
landlord_existing_pv = None
|
||||
landlord_property_id = "LLUPRN"
|
||||
landlord_sap = None
|
||||
outcomes_filename = None
|
||||
outcomes_sheetname = None
|
||||
outcomes_postcode = None
|
||||
outcomes_houseno = None
|
||||
outcomes_id = None
|
||||
outcomes_address = None
|
||||
master_filepaths = []
|
||||
master_id_colnames = []
|
||||
master_to_asset_list_filepath = None
|
||||
phase = False
|
||||
ecosurv_landlords = None
|
||||
asset_list_header = 0
|
||||
landlord_block_reference = None
|
||||
|
||||
# Peabody data for cleaning
|
||||
data_folder = (
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting "
|
||||
"Project/data_validation"
|
||||
)
|
||||
data_filename = "to_standardise_uprns.xlsx"
|
||||
sheet_name = "Sheet1"
|
||||
postcode_column = "Postcode"
|
||||
address1_column = None
|
||||
address1_method = "house_number_extraction"
|
||||
fulladdress_column = "Address"
|
||||
address_cols_to_concat = None
|
||||
>>>>>>> d4064da36565f87c2b72d10e9f3604cc6c37bdb6
|
||||
missing_postcodes_method = None
|
||||
landlord_year_built = None
|
||||
landlord_os_uprn = None
|
||||
landlord_property_type = None
|
||||
landlord_built_form = None
|
||||
landlord_wall_construction = None
|
||||
landlord_roof_construction = None
|
||||
landlord_heating_system = None
|
||||
landlord_existing_pv = None
|
||||
landlord_property_id = "UHTprop Ref"
|
||||
landlord_property_id = "UPRN"
|
||||
landlord_sap = None
|
||||
outcomes_filename = None
|
||||
outcomes_sheetname = None
|
||||
|
|
@ -286,7 +242,7 @@ def app():
|
|||
if skip is not None and not force_retrieve_data:
|
||||
if i <= skip:
|
||||
continue
|
||||
chunk = asset_list.standardised_asset_list[i: i + chunk_size]
|
||||
chunk = asset_list.standardised_asset_list[i : i + chunk_size]
|
||||
epc_data_chunk, errors_chunk, no_epc_chunk = get_data(
|
||||
df=chunk,
|
||||
row_id_name=asset_list.DOMNA_PROPERTY_ID,
|
||||
|
|
@ -429,7 +385,7 @@ def app():
|
|||
# Retrieve just the data we need
|
||||
epc_df = epc_df[
|
||||
[asset_list.DOMNA_PROPERTY_ID] + list(asset_list.EPC_API_DATA_NAMES.keys())
|
||||
].rename(columns=asset_list.EPC_API_DATA_NAMES)
|
||||
].rename(columns=asset_list.EPC_API_DATA_NAMES)
|
||||
|
||||
# Look for columns not in the find my EPC data, which will have happened if we didn't
|
||||
# retrieve it in the first place
|
||||
|
|
@ -446,7 +402,7 @@ def app():
|
|||
find_my_epc_data[
|
||||
[asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"]
|
||||
+ list(asset_list.FIND_EPC_DATA_NAMES.keys())
|
||||
].rename(columns=asset_list.FIND_EPC_DATA_NAMES),
|
||||
].rename(columns=asset_list.FIND_EPC_DATA_NAMES),
|
||||
how="left",
|
||||
on=asset_list.DOMNA_PROPERTY_ID,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -20,6 +20,29 @@ if EPC_AUTH_TOKEN is None:
|
|||
raise RuntimeError("EPC_AUTH_TOKEN not defined in env")
|
||||
|
||||
|
||||
def is_valid_postcode(postcode_clean: str) -> bool:
|
||||
"""
|
||||
Validate postcode using postcodes.io.
|
||||
|
||||
Expects a sanitised postcode (e.g. E84SQ).
|
||||
Returns True if valid, False otherwise.
|
||||
"""
|
||||
POSTCODES_IO_VALIDATE_URL = "https://api.postcodes.io/postcodes/{postcode}/validate"
|
||||
if not postcode_clean:
|
||||
return False
|
||||
|
||||
try:
|
||||
resp = requests.get(
|
||||
POSTCODES_IO_VALIDATE_URL.format(postcode=postcode_clean),
|
||||
timeout=5,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
return resp.json().get("result", False)
|
||||
except requests.RequestException:
|
||||
# Network issues, rate limits, etc.
|
||||
return False
|
||||
|
||||
|
||||
def levenshtein(a: str, b: str) -> float:
|
||||
"""
|
||||
Address similarity score in [0, 1].
|
||||
|
|
|
|||
|
|
@ -78,112 +78,14 @@ def sanitise_postcode(postcode: str) -> str | None:
|
|||
return postcode.upper().replace(" ", "")
|
||||
|
||||
|
||||
def is_valid_postcode(postcode_clean: str) -> bool:
|
||||
"""
|
||||
Validate postcode using postcodes.io.
|
||||
|
||||
Expects a sanitised postcode (e.g. E84SQ).
|
||||
Returns True if valid, False otherwise.
|
||||
"""
|
||||
POSTCODES_IO_VALIDATE_URL = "https://api.postcodes.io/postcodes/{postcode}/validate"
|
||||
if not postcode_clean:
|
||||
return False
|
||||
|
||||
try:
|
||||
resp = requests.get(
|
||||
POSTCODES_IO_VALIDATE_URL.format(postcode=postcode_clean),
|
||||
timeout=5,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
return resp.json().get("result", False)
|
||||
except requests.RequestException:
|
||||
# Network issues, rate limits, etc.
|
||||
return False
|
||||
|
||||
|
||||
def main():
|
||||
df = pd.read_excel("hackney.xlsx", sheet_name="Sustainability")
|
||||
df = df.head(500)
|
||||
|
||||
# Sanitise postcodes
|
||||
df["postcode_clean"] = df["Postcode"].apply(sanitise_postcode)
|
||||
|
||||
# --- validate AFTER grouping (save API calls) ---
|
||||
|
||||
# Get unique, non-null postcodes
|
||||
unique_postcodes = df["postcode_clean"].dropna().unique()
|
||||
|
||||
# Validate each postcode once, TODOadd a progress bar
|
||||
postcode_validity = {
|
||||
pc: is_valid_postcode(pc)
|
||||
for pc in tqdm(unique_postcodes, total=len(unique_postcodes))
|
||||
}
|
||||
|
||||
# Map validity back onto dataframe
|
||||
df["postcode_valid"] = df["postcode_clean"].map(postcode_validity)
|
||||
|
||||
results = []
|
||||
|
||||
for postcode, group_df in tqdm(
|
||||
df[df["postcode_valid"]].groupby("postcode_clean"),
|
||||
desc="Resolving UPRNs by postcode",
|
||||
):
|
||||
try:
|
||||
epc_df = get_epc_data_with_postcode(postcode)
|
||||
|
||||
if epc_df.empty:
|
||||
tmp = group_df.copy()
|
||||
tmp["found_uprn"] = None
|
||||
tmp["status"] = "no_epc_results"
|
||||
results.append(tmp)
|
||||
continue
|
||||
|
||||
resolved = resolve_uprns_for_postcode_group(
|
||||
group_df=group_df,
|
||||
epc_df=epc_df,
|
||||
)
|
||||
|
||||
results.append(resolved)
|
||||
|
||||
except Exception as e:
|
||||
tmp = group_df.copy()
|
||||
tmp["found_uprn"] = None
|
||||
tmp["status"] = "exception"
|
||||
tmp["error"] = str(e)
|
||||
results.append(tmp)
|
||||
|
||||
final_df = pd.concat(results, ignore_index=True)
|
||||
a = final_df[
|
||||
[
|
||||
"best_match_lexiscore",
|
||||
"Address 1",
|
||||
"best_match_address",
|
||||
"Postcode",
|
||||
"UPRN",
|
||||
"best_match_uprn",
|
||||
]
|
||||
] # add levi score to viewing
|
||||
b = final_df[final_df["best_match_lexiscore"] > 0] # add levi score to viewing
|
||||
b = b[
|
||||
[
|
||||
"best_match_lexiscore",
|
||||
"Address 1",
|
||||
"best_match_address",
|
||||
"Postcode",
|
||||
"UPRN",
|
||||
"best_match_uprn",
|
||||
]
|
||||
]
|
||||
|
||||
|
||||
def handler(event, context):
|
||||
def handler(event, context, local=False):
|
||||
print(f"Function: {context.function_name}")
|
||||
print(f"Request ID: {context.aws_request_id}")
|
||||
|
||||
# Example SQS message for testing (copy and paste into SQS):
|
||||
# {
|
||||
# "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917",
|
||||
# "s3_uri": "https://337213553626-7ovirzjr.eu-west-2.console.aws.amazon.com/s3/object/retrofit-data-dev?region=eu-west-2&prefix=ara_raw_inputs/peabody/2025_11_11+-+Peabody+-+Data+Extracts+for+Domna_transformed.csv",
|
||||
# "task_id":"e31f2f21-175b-4a91-a3ec-a6baa325e917",
|
||||
# "s3_uri":"s3://retrofit-data-dev/ara_raw_inputs/peabody/2025_11_11 - Peabody - Data Extracts for Domna_transformed.csv"
|
||||
# }
|
||||
|
||||
# Handle both single event and batch events (SQS, etc.)
|
||||
|
|
@ -196,7 +98,13 @@ def handler(event, context):
|
|||
task_id = None
|
||||
subtask_id = None
|
||||
try:
|
||||
# Parse body
|
||||
# For local development
|
||||
if local is True:
|
||||
record = {}
|
||||
record["body"] = (
|
||||
'{"task_id":"e31f2f21-175b-4a91-a3ec-a6baa325e917","s3_uri":"s3://retrofit-data-dev/ara_raw_inputs/peabody/2025_11_11 - Peabody - Data Extracts for Domna_transformed.csv"}'
|
||||
)
|
||||
# Parse body (inputs)
|
||||
if isinstance(record.get("body"), str):
|
||||
body = json.loads(record["body"])
|
||||
else:
|
||||
|
|
@ -236,17 +144,33 @@ def handler(event, context):
|
|||
df = pd.DataFrame(csv_data)
|
||||
logger.info(f"CSV loaded: {len(df)} rows, {len(df.columns)} columns")
|
||||
|
||||
# Get head for demo
|
||||
df_head = df.head()
|
||||
logger.info("DataFrame head:")
|
||||
logger.info(f"\n{df_head}")
|
||||
# Sanitise postcodes
|
||||
df["postcode_clean"] = df["Postcode"].apply(sanitise_postcode)
|
||||
|
||||
# Group by sanitised postcode (excluding null values)
|
||||
grouped_data = []
|
||||
for postcode, group_df in df.dropna(subset=["postcode_clean"]).groupby(
|
||||
"postcode_clean"
|
||||
):
|
||||
group_info = {
|
||||
"postcode": postcode,
|
||||
"row_count": len(group_df),
|
||||
"rows": group_df.to_dict(orient="records"),
|
||||
}
|
||||
grouped_data.append(group_info)
|
||||
logger.info(f"Postcode: {postcode}, Rows: {len(group_df)}")
|
||||
|
||||
logger.info(f"Total postcodes: {len(grouped_data)}")
|
||||
|
||||
results.append(
|
||||
{
|
||||
"message": "Postcode splitter processing started",
|
||||
"message": "Postcode splitter processing completed",
|
||||
"task_id": str(task_id),
|
||||
"s3_uri": s3_uri,
|
||||
"subtask_id": str(subtask_id),
|
||||
"total_rows": len(df),
|
||||
"total_postcodes": len(grouped_data),
|
||||
"grouped_data": grouped_data,
|
||||
}
|
||||
)
|
||||
|
||||
|
|
@ -258,6 +182,7 @@ def handler(event, context):
|
|||
"status": "processing_complete",
|
||||
"s3_uri": s3_uri,
|
||||
"rows_processed": len(df),
|
||||
"total_postcodes": len(grouped_data),
|
||||
},
|
||||
)
|
||||
logger.info(f"Subtask {subtask_id} marked as complete")
|
||||
|
|
@ -295,7 +220,3 @@ def handler(event, context):
|
|||
{"processed": results, "errors": errors if errors else None}
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue